Add option to configure SentenceBoundary (#348)
* Create FUNDING.yml * add setence boundary which is more friendly to Chinese users --------- Co-authored-by: Rany <ranygh@riseup.net>
This commit is contained in:
@@ -16,14 +16,17 @@ from typing import (
|
||||
Dict,
|
||||
Generator,
|
||||
List,
|
||||
Literal,
|
||||
Optional,
|
||||
Tuple,
|
||||
TypedDict,
|
||||
Union,
|
||||
)
|
||||
from xml.sax.saxutils import escape, unescape
|
||||
|
||||
import aiohttp
|
||||
import certifi
|
||||
from typing_extensions import NotRequired, Unpack
|
||||
|
||||
from .constants import DEFAULT_VOICE, SEC_MS_GEC_VERSION, WSS_HEADERS, WSS_URL
|
||||
from .data_classes import TTSConfig
|
||||
@@ -328,6 +331,14 @@ def calc_max_mesg_size(tts_config: TTSConfig) -> int:
|
||||
return websocket_max_size - overhead_per_message
|
||||
|
||||
|
||||
class CommunicateRequest(TypedDict):
|
||||
"""
|
||||
A class to communicate with the service.
|
||||
"""
|
||||
|
||||
Boundary: NotRequired[Literal["WordBoundary", "SentenceBoundary"]]
|
||||
|
||||
|
||||
class Communicate:
|
||||
"""
|
||||
Communicate with the service.
|
||||
@@ -345,9 +356,21 @@ class Communicate:
|
||||
proxy: Optional[str] = None,
|
||||
connect_timeout: Optional[int] = 10,
|
||||
receive_timeout: Optional[int] = 60,
|
||||
**kwargs: Unpack[CommunicateRequest],
|
||||
):
|
||||
"""
|
||||
Args:
|
||||
boundary (str): The boundary to use for the TTS.
|
||||
Defaults to "WordBoundary".
|
||||
Valid values are "WordBoundary" and "SentenceBoundary".
|
||||
If "WordBoundary", the TTS will return a word boundary for each word.
|
||||
If "SentenceBoundary", the TTS will return a sentence boundary for each sentence.
|
||||
Which is more friendly to Chinese users.
|
||||
"""
|
||||
|
||||
# Validate TTS settings and store the TTSConfig object.
|
||||
self.tts_config = TTSConfig(voice, rate, volume, pitch)
|
||||
boundary = kwargs.get("Boundary", "WordBoundary")
|
||||
self.tts_config = TTSConfig(voice, rate, volume, pitch, boundary)
|
||||
|
||||
# Validate the text parameter.
|
||||
if not isinstance(text, str):
|
||||
@@ -392,7 +415,7 @@ class Communicate:
|
||||
def __parse_metadata(self, data: bytes) -> TTSChunk:
|
||||
for meta_obj in json.loads(data)["Metadata"]:
|
||||
meta_type = meta_obj["Type"]
|
||||
if meta_type == "WordBoundary":
|
||||
if meta_type in ("WordBoundary", "SentenceBoundary"):
|
||||
current_offset = (
|
||||
meta_obj["Data"]["Offset"] + self.state["offset_compensation"]
|
||||
)
|
||||
@@ -411,12 +434,16 @@ class Communicate:
|
||||
async def __stream(self) -> AsyncGenerator[TTSChunk, None]:
|
||||
async def send_command_request() -> None:
|
||||
"""Sends the command request to the service."""
|
||||
word_boundary = self.tts_config.boundary == "WordBoundary"
|
||||
wd = "true" if word_boundary else "false"
|
||||
sq = "true" if not word_boundary else "false"
|
||||
await websocket.send_str(
|
||||
f"X-Timestamp:{date_to_string()}\r\n"
|
||||
"Content-Type:application/json; charset=utf-8\r\n"
|
||||
"Path:speech.config\r\n\r\n"
|
||||
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
|
||||
'"sentenceBoundaryEnabled":"false","wordBoundaryEnabled":"true"},'
|
||||
f'"sentenceBoundaryEnabled":"{sq}","wordBoundaryEnabled":"{wd}"'
|
||||
"},"
|
||||
'"outputFormat":"audio-24khz-48kbitrate-mono-mp3"'
|
||||
"}}}}\r\n"
|
||||
)
|
||||
@@ -603,9 +630,9 @@ class Communicate:
|
||||
async for message in self.stream():
|
||||
if message["type"] == "audio":
|
||||
audio.write(message["data"])
|
||||
elif (
|
||||
isinstance(metadata, TextIOWrapper)
|
||||
and message["type"] == "WordBoundary"
|
||||
elif isinstance(metadata, TextIOWrapper) and message["type"] in (
|
||||
"WordBoundary",
|
||||
"SentenceBoundary",
|
||||
):
|
||||
json.dump(message, metadata)
|
||||
metadata.write("\n")
|
||||
|
||||
Reference in New Issue
Block a user