Update edge_tts to v6.1.3
* Sort voices in util's --list-voices
* Disable overlapping subtitles by default
* Try to fix subtitles for many-hour-long TTS generation
* Match Microsoft Edge connection behavior more closely
* Bump edge_tts version
This commit is contained in:
2
lint.sh
2
lint.sh
@@ -1,2 +1,4 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
set -ux
|
||||||
find src examples -name '*.py' | xargs pylint
|
find src examples -name '*.py' | xargs pylint
|
||||||
find src examples -name '*.py' | xargs mypy
|
find src examples -name '*.py' | xargs mypy
|
||||||
|
|||||||
4
pylintrc
4
pylintrc
@@ -283,10 +283,10 @@ max-attributes=7
|
|||||||
max-bool-expr=5
|
max-bool-expr=5
|
||||||
|
|
||||||
# Maximum number of branch for function / method body.
|
# Maximum number of branch for function / method body.
|
||||||
max-branches=15
|
max-branches=16
|
||||||
|
|
||||||
# Maximum number of locals for function / method body.
|
# Maximum number of locals for function / method body.
|
||||||
max-locals=15
|
max-locals=18
|
||||||
|
|
||||||
# Maximum number of parents for a class (see R0901).
|
# Maximum number of parents for a class (see R0901).
|
||||||
max-parents=7
|
max-parents=7
|
||||||
|
|||||||
@@ -254,9 +254,15 @@ class Communicate:
|
|||||||
self.voice: str = voice
|
self.voice: str = voice
|
||||||
match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice)
|
match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice)
|
||||||
if match is not None:
|
if match is not None:
|
||||||
|
lang = match.group(1)
|
||||||
|
region = match.group(2)
|
||||||
|
name = match.group(3)
|
||||||
|
if name.find("-") != -1:
|
||||||
|
region = region + "-" + name[: name.find("-")]
|
||||||
|
name = name[name.find("-") + 1 :]
|
||||||
self.voice = (
|
self.voice = (
|
||||||
"Microsoft Server Speech Text to Speech Voice"
|
"Microsoft Server Speech Text to Speech Voice"
|
||||||
+ f" ({match.group(1)}-{match.group(2)}, {match.group(3)})"
|
+ f" ({lang}-{region}, {name})"
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -291,24 +297,29 @@ class Communicate:
|
|||||||
escape(remove_incompatible_characters(self.text)),
|
escape(remove_incompatible_characters(self.text)),
|
||||||
calc_max_mesg_size(self.voice, self.rate, self.volume),
|
calc_max_mesg_size(self.voice, self.rate, self.volume),
|
||||||
)
|
)
|
||||||
|
final_utterance: Dict[int, int] = {}
|
||||||
|
prev_idx = -1
|
||||||
|
shift_time = -1
|
||||||
|
|
||||||
async with aiohttp.ClientSession(trust_env=True) as session, session.ws_connect(
|
for idx, text in enumerate(texts):
|
||||||
f"{WSS_URL}&ConnectionId={connect_id()}",
|
async with aiohttp.ClientSession(
|
||||||
compress=15,
|
trust_env=True
|
||||||
autoclose=True,
|
) as session, session.ws_connect(
|
||||||
autoping=True,
|
f"{WSS_URL}&ConnectionId={connect_id()}",
|
||||||
proxy=self.proxy,
|
compress=15,
|
||||||
headers={
|
autoclose=True,
|
||||||
"Pragma": "no-cache",
|
autoping=True,
|
||||||
"Cache-Control": "no-cache",
|
proxy=self.proxy,
|
||||||
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
headers={
|
||||||
"Accept-Encoding": "gzip, deflate, br",
|
"Pragma": "no-cache",
|
||||||
"Accept-Language": "en-US,en;q=0.9",
|
"Cache-Control": "no-cache",
|
||||||
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
"Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
|
||||||
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
|
"Accept-Encoding": "gzip, deflate, br",
|
||||||
},
|
"Accept-Language": "en-US,en;q=0.9",
|
||||||
) as websocket:
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
|
||||||
for text in texts:
|
" (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
|
||||||
|
},
|
||||||
|
) as websocket:
|
||||||
# download indicates whether we should be expecting audio data,
|
# download indicates whether we should be expecting audio data,
|
||||||
# this is so that we avoid getting binary data from the websocket
|
# this is so that we avoid getting binary data from the websocket
|
||||||
# and falsely thinking it's audio data.
|
# and falsely thinking it's audio data.
|
||||||
@@ -362,10 +373,25 @@ class Communicate:
|
|||||||
elif path == b"audio.metadata":
|
elif path == b"audio.metadata":
|
||||||
for meta_obj in json.loads(data)["Metadata"]:
|
for meta_obj in json.loads(data)["Metadata"]:
|
||||||
meta_type = meta_obj["Type"]
|
meta_type = meta_obj["Type"]
|
||||||
|
if idx != prev_idx:
|
||||||
|
shift_time = sum(
|
||||||
|
final_utterance[i] for i in range(idx)
|
||||||
|
)
|
||||||
|
prev_idx = idx
|
||||||
if meta_type == "WordBoundary":
|
if meta_type == "WordBoundary":
|
||||||
|
final_utterance[idx] = (
|
||||||
|
meta_obj["Data"]["Offset"]
|
||||||
|
+ meta_obj["Data"]["Duration"]
|
||||||
|
# Average padding added by the service
|
||||||
|
# Alternatively we could use ffmpeg to get value properly
|
||||||
|
# but I don't want to add an additional dependency
|
||||||
|
# if this is found to work well enough.
|
||||||
|
+ 8_750_000
|
||||||
|
)
|
||||||
yield {
|
yield {
|
||||||
"type": meta_type,
|
"type": meta_type,
|
||||||
"offset": meta_obj["Data"]["Offset"],
|
"offset": meta_obj["Data"]["Offset"]
|
||||||
|
+ shift_time,
|
||||||
"duration": meta_obj["Data"]["Duration"],
|
"duration": meta_obj["Data"]["Duration"],
|
||||||
"text": meta_obj["Data"]["text"]["Text"],
|
"text": meta_obj["Data"]["text"]["Text"],
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,7 +40,7 @@ class SubMaker:
|
|||||||
SubMaker class
|
SubMaker class
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, overlapping: int = 1) -> None:
|
def __init__(self, overlapping: int = 0) -> None:
|
||||||
"""
|
"""
|
||||||
SubMaker constructor.
|
SubMaker constructor.
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,9 @@ from edge_tts import Communicate, SubMaker, list_voices
|
|||||||
|
|
||||||
async def _print_voices(*, proxy: str) -> None:
|
async def _print_voices(*, proxy: str) -> None:
|
||||||
"""Print all available voices."""
|
"""Print all available voices."""
|
||||||
for idx, voice in enumerate(await list_voices(proxy=proxy)):
|
voices = await list_voices(proxy=proxy)
|
||||||
|
voices = sorted(voices, key=lambda voice: voice["ShortName"]) # type: ignore
|
||||||
|
for idx, voice in enumerate(voices):
|
||||||
if idx != 0:
|
if idx != 0:
|
||||||
print()
|
print()
|
||||||
|
|
||||||
@@ -82,8 +84,8 @@ async def _async_main() -> None:
|
|||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-O",
|
"-O",
|
||||||
"--overlapping",
|
"--overlapping",
|
||||||
help="overlapping subtitles in seconds",
|
help="overlapping subtitles in seconds. Default: 0.",
|
||||||
default=1,
|
default=0,
|
||||||
type=float,
|
type=float,
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
"""Edge TTS version information."""
|
"""Edge TTS version information."""
|
||||||
|
|
||||||
__version__ = "6.1.1"
|
__version__ = "6.1.3"
|
||||||
__version_info__ = tuple(int(num) for num in __version__.split("."))
|
__version_info__ = tuple(int(num) for num in __version__.split("."))
|
||||||
|
|||||||
Reference in New Issue
Block a user