Add option to configure SentenceBoundary (#348)
* Create FUNDING.yml
* Add SentenceBoundary support, which emits sentence-level boundary events and is friendlier to Chinese users
---------
Co-authored-by: Rany <ranygh@riseup.net>
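In short, the commit adds an optional Boundary keyword to edge_tts.Communicate that switches the service's metadata events from per-word to per-sentence boundaries, and SubMaker now accepts both event types when building subtitles. A minimal usage sketch, adapted from the new sync example added by this commit (the output file names are illustrative, not part of the commit):

import edge_tts

TEXT = "君不见,黄河之水天上来,奔流到海不复回。"
VOICE = "zh-CN-YunjianNeural"

# Request sentence-level boundary events instead of the default word-level ones.
communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
submaker = edge_tts.SubMaker()

with open("output.mp3", "wb") as audio_file:  # illustrative output path
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio":
            audio_file.write(chunk["data"])
        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
            submaker.feed(chunk)

with open("output.srt", "w", encoding="utf-8") as srt_file:  # illustrative output path
    srt_file.write(submaker.get_srt())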
.github/FUNDING.yml (vendored, new file, 1 line added)
@@ -0,0 +1 @@
+github: rany2
.gitignore (vendored, 1 line added)
@@ -162,3 +162,4 @@ cython_debug/
 # Edge-TTS specific ignores
 *.mp3
 *.srt
+/.idea/
@@ -21,7 +21,7 @@ async def amain() -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     with open(SRT_FILE, "w", encoding="utf-8") as file:
@@ -20,7 +20,7 @@ def main() -> None:
         for chunk in communicate.stream_sync():
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     with open(SRT_FILE, "w", encoding="utf-8") as file:
@@ -0,0 +1,42 @@
+#!/usr/bin/env python3
+
+"""Sync variant of the async .stream() method to
+get audio chunks and feed them to SubMaker to
+generate subtitles"""
+import sys
+
+import edge_tts
+
+TEXT = """君不见,黄河之水天上来,奔流到海不复回。
+君不见,高堂明镜悲白发,朝如青丝暮成雪。
+人生得意须尽欢,莫使金樽空对月。
+天生我材必有用,千金散尽还复来。
+烹羊宰牛且为乐,会须一饮三百杯。
+岑夫子,丹丘生,将进酒,杯莫停。
+与君歌一曲,请君为我倾耳听。
+钟鼓馔玉不足贵,但愿长醉不复醒。
+古来圣贤皆寂寞,惟有饮者留其名。
+陈王昔时宴平乐,斗酒十千恣欢谑。
+主人何为言少钱,径须沽取对君酌。
+五花马,千金裘,呼儿将出换美酒,与尔同销万古愁。"""
+VOICE = "zh-CN-YunjianNeural"
+
+
+def main() -> None:
+    """Main function"""
+    communicate = edge_tts.Communicate(TEXT, VOICE, Boundary="SentenceBoundary")
+    submaker = edge_tts.SubMaker()
+    stdout = sys.stdout
+    audio_bytes = []
+    for chunk in communicate.stream_sync():
+        if chunk["type"] == "audio":
+            audio_bytes.append(chunk["data"])
+        elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
+            submaker.feed(chunk)
+
+    stdout.write(f"audio file length: {len(audio_bytes)}")
+    stdout.write(submaker.get_srt())
+
+
+if __name__ == "__main__":
+    main()
@@ -16,14 +16,17 @@ from typing import (
     Dict,
     Generator,
     List,
+    Literal,
     Optional,
     Tuple,
+    TypedDict,
     Union,
 )
 from xml.sax.saxutils import escape, unescape

 import aiohttp
 import certifi
+from typing_extensions import NotRequired, Unpack

 from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
 from .data_classes import TTSConfig
@@ -328,6 +331,14 @@ def calc_max_mesg_size(tts_config: TTSConfig) -> int:
     return websocket_max_size - overhead_per_message


+class CommunicateRequest(TypedDict):
+    """
+    A class to communicate with the service.
+    """
+
+    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
+
+
 class Communicate:
     """
     Communicate with the service.
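CommunicateRequest exists purely for static typing: paired with Unpack in the updated __init__ signature below, it lets type checkers validate the optional Boundary= keyword without changing the positional API. A small self-contained sketch of the pattern, using illustrative names not taken from the library:

from typing import Literal

from typing_extensions import NotRequired, TypedDict, Unpack


class RequestOptions(TypedDict):
    # NotRequired marks the key as optional for callers.
    Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]


def make_request(text: str, **kwargs: Unpack[RequestOptions]) -> str:
    # kwargs.get() supplies the default when the caller omits the key.
    return f"{text}: {kwargs.get('Boundary', 'WordBoundary')}"


print(make_request("hello"))                             # hello: WordBoundary
print(make_request("你好", Boundary="SentenceBoundary"))  # 你好: SentenceBoundary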
@@ -345,9 +356,21 @@ class Communicate:
         proxy: Optional[str] = None,
         connect_timeout: Optional[int] = 10,
         receive_timeout: Optional[int] = 60,
+        **kwargs: Unpack[CommunicateRequest],
     ):
+        """
+        Args:
+            boundary (str): The boundary to use for the TTS.
+                Defaults to "WordBoundary".
+                Valid values are "WordBoundary" and "SentenceBoundary".
+                If "WordBoundary", the TTS will return a word boundary for each word.
+                If "SentenceBoundary", the TTS will return a sentence boundary for each sentence.
+                Which is more friendly to Chinese users.
+        """
+
         # Validate TTS settings and store the TTSConfig object.
-        self.tts_config = TTSConfig(voice, rate, volume, pitch)
+        boundary = kwargs.get("Boundary", "WordBoundary")
+        self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)

         # Validate the text parameter.
         if not isinstance(text, str):
@@ -392,7 +415,7 @@ class Communicate:
     def __parse_metadata(self, data: bytes) -> TTSChunk:
         for meta_obj in json.loads(data)["Metadata"]:
             meta_type = meta_obj["Type"]
-            if meta_type == "WordBoundary":
+            if meta_type in ("WordBoundary", "SentenceBoundary"):
                 current_offset = (
                     meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
                 )
@@ -411,12 +434,16 @@ class Communicate:
     async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
         async def send_command_request() -> None:
             """Sends the command request to the service."""
+            word_boundary = self.tts_config.boundary == "WordBoundary"
+            wd = "true" if word_boundary else "false"
+            sq = "true" if not word_boundary else "false"
             await websocket.send_str(
                 f"X-Timestamp:{date_to_string()}\r\n"
                 "Content-Type:application/json; charset=utf-8\r\n"
                 "Path:speech.config\r\n\r\n"
                 '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                '"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
+                f'"sentenceBoundaryEnabled":"{sq}","wordBoundaryEnabled":"{wd}"'
+                "},"
                 '"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
                 "}}}}\r\n"
             )
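For reference, the speech.config body that send_command_request builds resolves to the structure sketched below; with the default WordBoundary setting the two flags read "false"/"true", and with SentenceBoundary they are simply swapped. The real message is sent as a single line over the websocket; this is a pretty-printed Python rendering:

# Illustrative: the payload produced for Boundary="SentenceBoundary".
speech_config = {
    "context": {
        "synthesis": {
            "audio": {
                "metadataoptions": {
                    "sentenceBoundaryEnabled": "true",
                    "wordBoundaryEnabled": "false",
                },
                "outputFormat": "audio-24khz-48kbitrate-mono-mp3",
            }
        }
    }
}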
@@ -603,9 +630,9 @@ class Communicate:
             async for message in self.stream():
                 if message["type"] == "audio":
                     audio.write(message["data"])
-                elif (
-                    isinstance(metadata, TextIOWrapper)
-                    and message["type"] == "WordBoundary"
+                elif isinstance(metadata, TextIOWrapper) and message["type"] in (
+                    "WordBoundary",
+                    "SentenceBoundary",
                 ):
                     json.dump(message, metadata)
                     metadata.write("\n")
@@ -17,6 +17,7 @@ class TTSConfig:
     rate: str
     volume: str
     pitch: str
+    boundary: str

     @staticmethod
     def validate_string_param(param_name: str, param_value: str, pattern: str) -> str:
@@ -1,4 +1,4 @@
-"""SubMaker module is used to generate subtitles from WordBoundary events."""
+"""SubMaker module is used to generate subtitles from WordBoundary and SentenceBoundary events."""

 from typing import List

@@ -9,7 +9,7 @@ from .typing import TTSChunk

 class SubMaker:
     """
-    SubMaker is used to generate subtitles from WordBoundary messages.
+    SubMaker is used to generate subtitles from WordBoundary and SentenceBoundary messages.
     """

     def __init__(self) -> None:
@@ -17,15 +17,15 @@ class SubMaker:

     def feed(self, msg: TTSChunk) -> None:
         """
-        Feed a WordBoundary message to the SubMaker object.
+        Feed a WordBoundary or SentenceBoundary message to the SubMaker object.

         Args:
-            msg (dict): The WordBoundary message.
+            msg (dict): The WordBoundary or SentenceBoundary message.

         Returns:
             None
         """
-        if msg["type"] != "WordBoundary":
+        if msg["type"] not in ("WordBoundary", "SentenceBoundary"):
             raise ValueError("Invalid message type, expected 'WordBoundary'")

         self.cues.append(
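With the relaxed type check, feed() accepts sentence-level chunks directly. A minimal sketch with a hand-built chunk; the offset and duration values are made up and only illustrate the fields that Communicate.stream() yields:

import edge_tts

submaker = edge_tts.SubMaker()
# Hypothetical SentenceBoundary chunk; real values come from Communicate.stream().
submaker.feed(
    {
        "type": "SentenceBoundary",
        "offset": 0.0,
        "duration": 25_000_000.0,
        "text": "君不见,黄河之水天上来,奔流到海不复回。",
    }
)
print(submaker.get_srt())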
@@ -10,11 +10,11 @@ from typing_extensions import Literal, NotRequired, TypedDict
 class TTSChunk(TypedDict):
     """TTS chunk data."""

-    type: Literal["audio", "WordBoundary"]
+    type: Literal["audio", "WordBoundary", "SentenceBoundary"]
     data: NotRequired[bytes]  # only for audio
-    duration: NotRequired[float]  # only for WordBoundary
-    offset: NotRequired[float]  # only for WordBoundary
-    text: NotRequired[str]  # only for WordBoundary
+    duration: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    offset: NotRequired[float]  # only for WordBoundary and SentenceBoundary
+    text: NotRequired[str]  # only for WordBoundary and SentenceBoundary


 class VoiceTag(TypedDict):
@@ -72,7 +72,7 @@ async def _run_tts(args: UtilArgs) -> None:
         async for chunk in communicate.stream():
             if chunk["type"] == "audio":
                 audio_file.write(chunk["data"])
-            elif chunk["type"] == "WordBoundary":
+            elif chunk["type"] in ("WordBoundary", "SentenceBoundary"):
                 submaker.feed(chunk)

     if args.words_in_cue > 0: