Update edge_tts to v6.1.3

* Sort voices in util's --list-voices
* Disable overlapping subtitles by default
* Try to fix subtitles for many-hour-long TTS generation
* Match Microsoft Edge connection behavior more closely
* Bump edge_tts version
This commit is contained in:
rany2
2023-01-25 18:29:40 +02:00
parent 85eef7d918
commit 58307ded26
6 changed files with 56 additions and 26 deletions

View File

@@ -1,2 +1,4 @@
#!/bin/sh
set -ux
find src examples -name '*.py' | xargs pylint find src examples -name '*.py' | xargs pylint
find src examples -name '*.py' | xargs mypy find src examples -name '*.py' | xargs mypy

View File

@@ -283,10 +283,10 @@ max-attributes=7
max-bool-expr=5 max-bool-expr=5
# Maximum number of branch for function / method body. # Maximum number of branch for function / method body.
max-branches=15 max-branches=16
# Maximum number of locals for function / method body. # Maximum number of locals for function / method body.
max-locals=15 max-locals=18
# Maximum number of parents for a class (see R0901). # Maximum number of parents for a class (see R0901).
max-parents=7 max-parents=7

View File

@@ -254,9 +254,15 @@ class Communicate:
self.voice: str = voice self.voice: str = voice
match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice) match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice)
if match is not None: if match is not None:
lang = match.group(1)
region = match.group(2)
name = match.group(3)
if name.find("-") != -1:
region = region + "-" + name[: name.find("-")]
name = name[name.find("-") + 1 :]
self.voice = ( self.voice = (
"Microsoft Server Speech Text to Speech Voice" "Microsoft Server Speech Text to Speech Voice"
+ f" ({match.group(1)}-{match.group(2)}, {match.group(3)})" + f" ({lang}-{region}, {name})"
) )
if ( if (
@@ -291,24 +297,29 @@ class Communicate:
escape(remove_incompatible_characters(self.text)), escape(remove_incompatible_characters(self.text)),
calc_max_mesg_size(self.voice, self.rate, self.volume), calc_max_mesg_size(self.voice, self.rate, self.volume),
) )
final_utterance: Dict[int, int] = {}
prev_idx = -1
shift_time = -1
async with aiohttp.ClientSession(trust_env=True) as session, session.ws_connect( for idx, text in enumerate(texts):
f"{WSS_URL}&ConnectionId={connect_id()}", async with aiohttp.ClientSession(
compress=15, trust_env=True
autoclose=True, ) as session, session.ws_connect(
autoping=True, f"{WSS_URL}&ConnectionId={connect_id()}",
proxy=self.proxy, compress=15,
headers={ autoclose=True,
"Pragma": "no-cache", autoping=True,
"Cache-Control": "no-cache", proxy=self.proxy,
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold", headers={
"Accept-Encoding": "gzip, deflate, br", "Pragma": "no-cache",
"Accept-Language": "en-US,en;q=0.9", "Cache-Control": "no-cache",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41", "Accept-Encoding": "gzip, deflate, br",
}, "Accept-Language": "en-US,en;q=0.9",
) as websocket: "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
for text in texts: " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
},
) as websocket:
# download indicates whether we should be expecting audio data, # download indicates whether we should be expecting audio data,
# this is so that we avoid getting binary data from the websocket # this is so that we avoid getting binary data from the websocket
# and falsely thinking it's audio data. # and falsely thinking it's audio data.
@@ -362,10 +373,25 @@ class Communicate:
elif path == b"audio.metadata": elif path == b"audio.metadata":
for meta_obj in json.loads(data)["Metadata"]: for meta_obj in json.loads(data)["Metadata"]:
meta_type = meta_obj["Type"] meta_type = meta_obj["Type"]
if idx != prev_idx:
shift_time = sum(
final_utterance[i] for i in range(idx)
)
prev_idx = idx
if meta_type == "WordBoundary": if meta_type == "WordBoundary":
final_utterance[idx] = (
meta_obj["Data"]["Offset"]
+ meta_obj["Data"]["Duration"]
# Average padding added by the service
# Alternatively we could use ffmpeg to get value properly
# but I don't want to add an additional dependency
# if this is found to work well enough.
+ 8_750_000
)
yield { yield {
"type": meta_type, "type": meta_type,
"offset": meta_obj["Data"]["Offset"], "offset": meta_obj["Data"]["Offset"]
+ shift_time,
"duration": meta_obj["Data"]["Duration"], "duration": meta_obj["Data"]["Duration"],
"text": meta_obj["Data"]["text"]["Text"], "text": meta_obj["Data"]["text"]["Text"],
} }

View File

@@ -40,7 +40,7 @@ class SubMaker:
SubMaker class SubMaker class
""" """
def __init__(self, overlapping: int = 1) -> None: def __init__(self, overlapping: int = 0) -> None:
""" """
SubMaker constructor. SubMaker constructor.

View File

@@ -14,7 +14,9 @@ from edge_tts import Communicate, SubMaker, list_voices
async def _print_voices(*, proxy: str) -> None: async def _print_voices(*, proxy: str) -> None:
"""Print all available voices.""" """Print all available voices."""
for idx, voice in enumerate(await list_voices(proxy=proxy)): voices = await list_voices(proxy=proxy)
voices = sorted(voices, key=lambda voice: voice["ShortName"]) # type: ignore
for idx, voice in enumerate(voices):
if idx != 0: if idx != 0:
print() print()
@@ -82,8 +84,8 @@ async def _async_main() -> None:
parser.add_argument( parser.add_argument(
"-O", "-O",
"--overlapping", "--overlapping",
help="overlapping subtitles in seconds", help="overlapping subtitles in seconds. Default: 0.",
default=1, default=0,
type=float, type=float,
) )
parser.add_argument( parser.add_argument(

View File

@@ -1,4 +1,4 @@
"""Edge TTS version information.""" """Edge TTS version information."""
__version__ = "6.1.1" __version__ = "6.1.3"
__version_info__ = tuple(int(num) for num in __version__.split(".")) __version_info__ = tuple(int(num) for num in __version__.split("."))