Update edge-tts to version 4.0.11

* Add pylint check for lint.sh
* Change overlapping default from 5 seconds to 1 second for SubMaker and util.py
* Default to WordBoundary for edge-playback (from SentenceBoundary)
* Drop SentenceBoundary support (it never worked properly and needed too many hacks)
  - Azure's official SDK has not actually supported it for a few years
    already
* Stop attempting to correct broken offsets sent back to us by Azure
  - Fixes never work properly because the AI voice sometimes takes odd pauses
    at the start and sometimes doesn't. This is never predictable and
    cannot be fixed on the library's end.
  - The solution is for Microsoft to fix the integer overflow bug they are facing
    in the {Word,Sentence}Boundary offsets. It doesn't affect us until we reach
    30-minute-long TTS anyway.
* Have edge-tts --list-voices use the configured HTTP proxy
* More misc changes and fixes
This commit is contained in:
rany2
2022-05-29 18:08:16 +03:00
parent 9a20f1ca90
commit 797d04f182
7 changed files with 53 additions and 67 deletions

View File

@@ -1,2 +1,3 @@
# Format, sort imports, and lint every Python file under src/ and examples/.
# `-exec CMD {} +` batches the matched files into as few invocations as
# possible (like xargs), but hands each path over as a single argument, so
# file names containing spaces or quotes are not split. It also skips the
# tool entirely when no file matches instead of running it with no arguments.
find src examples -name '*.py' -exec black {} +
find src examples -name '*.py' -exec isort {} +
find src examples -name '*.py' -exec pylint {} +

View File

@@ -1,6 +1,6 @@
[metadata]
name = edge-tts
version = 4.0.10
version = 4.0.11
author = rany
author_email = ranygh@riseup.net
description = Microsoft Edge's TTS

View File

@@ -28,7 +28,7 @@ def main():
with subprocess.Popen(
[
"edge-tts",
"--boundary-type=2",
"--boundary-type=1",
f"--write-media={media.name}",
f"--write-subtitles={subtitle.name}",
]

View File

@@ -55,8 +55,8 @@ def remove_incompatible_characters(string):
string = list(string)
for idx in range(len(string)): # pylint: disable=consider-using-enumerate
code = ord(string[idx])
for idx, char in enumerate(string):
code = ord(char)
if (0 <= code <= 8) or (11 <= code <= 12) or (14 <= code <= 31):
string[idx] = " "
@@ -193,7 +193,7 @@ def ssml_headers_plus_data(request_id, timestamp, ssml):
)
class Communicate: # pylint: disable=too-few-public-methods
class Communicate:
"""
Class for communicating with the service.
"""
@@ -215,7 +215,7 @@ class Communicate: # pylint: disable=too-few-public-methods
volume="+0%",
customspeak=False,
proxy=None,
): # pylint: disable=too-many-arguments, too-many-locals
):
"""
Runs the Communicate class.
@@ -234,14 +234,14 @@ class Communicate: # pylint: disable=too-few-public-methods
"""
word_boundary = False
sentence_boundary = False
if boundary_type > 0:
word_boundary = True
if boundary_type > 1:
sentence_boundary = True
raise ValueError(
"Invalid boundary type. SentenceBoundary is no longer supported."
)
sentence_boundary = str(sentence_boundary).lower()
word_boundary = str(word_boundary).lower()
if not customspeak:
@@ -262,12 +262,8 @@ class Communicate: # pylint: disable=too-few-public-methods
if isinstance(messages, str):
messages = [messages]
# Variables for the loop
download = False
current_subtitle = ""
first_offset = None
last_offset = None
async with aiohttp.ClientSession(trust_env=True) as session:
async with session.ws_connect(
f"{WSS_URL}&ConnectionId={connect_id()}",
@@ -304,7 +300,7 @@ class Communicate: # pylint: disable=too-few-public-methods
"Content-Type:application/json; charset=utf-8\r\n"
"Path:speech.config\r\n\r\n"
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
f'"sentenceBoundaryEnabled":{sentence_boundary},'
f'"sentenceBoundaryEnabled":false,'
f'"wordBoundaryEnabled":{word_boundary}}},"outputFormat":"{codec}"'
"}}}}\r\n"
)
@@ -326,12 +322,6 @@ class Communicate: # pylint: disable=too-few-public-methods
# Begin listening for the response.
async for received in websocket:
if received.type in (
aiohttp.WSMsgType.CLOSED,
aiohttp.WSMsgType.ERROR,
):
break
if received.type == aiohttp.WSMsgType.TEXT:
parameters, data = get_headers_and_data(received.data)
if (
@@ -358,12 +348,15 @@ class Communicate: # pylint: disable=too-few-public-methods
metadata_duration = metadata["Metadata"][0]["Data"][
"Duration"
]
except KeyError:
metadata_duration = 0
except KeyError as exception:
raise ValueError(
"The metadata doesn't contain a Duration field. "
+ "This usually happens when SentenceBoundary metadata type is sent."
) from exception
metadata_text = metadata["Metadata"][0]["Data"]["text"][
"Text"
]
if boundary_type == 1:
if metadata_type == "WordBoundary":
yield (
[
metadata_offset,
@@ -372,31 +365,32 @@ class Communicate: # pylint: disable=too-few-public-methods
metadata_text,
None,
)
elif metadata_type == "SentenceBoundary":
raise NotImplementedError(
"SentenceBoundary is not supported due to being broken."
)
else:
if metadata_type == "WordBoundary":
if current_subtitle:
current_subtitle += " "
current_subtitle += metadata_text
if first_offset is None:
first_offset = metadata_offset
last_offset = [
metadata_offset,
metadata_duration,
]
elif metadata_type == "SentenceBoundary":
if current_subtitle:
yield (
[
first_offset,
sum(last_offset) - first_offset,
],
current_subtitle,
None,
)
current_subtitle = ""
first_offset = None
last_offset = None
raise NotImplementedError(
f"Unknown metadata type: {metadata_type}"
)
elif (
"Path" in parameters
and parameters["Path"] == "response"
):
# TODO: implement this:
"""
X-RequestId:xxxxxxxxxxxxxxxxxxxxxxxxx
Content-Type:application/json; charset=utf-8
Path:response
{"context":{"serviceTag":"yyyyyyyyyyyyyyyyyyy"},"audio":{"type":"inline","streamId":"zzzzzzzzzzzzzzzzz"}}
"""
pass
else:
raise ValueError(
"The response from the service is not recognized.\n"
+ received.data
)
elif received.type == aiohttp.WSMsgType.BINARY:
if download:
yield (
@@ -406,10 +400,8 @@ class Communicate: # pylint: disable=too-few-public-methods
received.data.split(b"Path:audio\r\n")[1:]
),
)
if current_subtitle:
yield (
[first_offset, sum(last_offset) - first_offset],
current_subtitle,
None,
)
else:
raise ValueError(
"The service sent a binary message, but we are not expecting one."
)
await websocket.close()

View File

@@ -9,7 +9,7 @@ import aiohttp
from .constants import VOICE_LIST
async def list_voices():
async def list_voices(proxy=None):
"""
List all available voices and their attributes.
@@ -36,6 +36,7 @@ async def list_voices():
"Accept-Encoding": "gzip, deflate, br",
"Accept-Language": "en-US,en;q=0.9",
},
proxy=proxy,
) as url:
data = json.loads(await url.text())
return data

View File

@@ -39,7 +39,7 @@ class SubMaker:
SubMaker class
"""
def __init__(self, overlapping=5):
def __init__(self, overlapping=1):
"""
SubMaker constructor.
@@ -48,7 +48,6 @@ class SubMaker:
subtitles should overlap.
"""
self.subs_and_offset = []
self.broken_offset = 0
self.overlapping = overlapping * (10**7)
def create_sub(self, timestamp, text):
@@ -64,13 +63,6 @@ class SubMaker:
None
"""
timestamp[1] += timestamp[0]
if len(self.subs_and_offset) >= 2:
if self.subs_and_offset[-2][1] >= timestamp[0] + self.broken_offset:
self.broken_offset = self.subs_and_offset[-2][1]
timestamp[0] += self.broken_offset
timestamp[1] += self.broken_offset
self.subs_and_offset.append(timestamp)
self.subs_and_offset.append(text)

View File

@@ -10,11 +10,11 @@ import sys
from edge_tts import Communicate, SubMaker, list_voices
async def _list_voices():
async def _list_voices(proxy):
"""
List available voices.
"""
for idx, voice in enumerate(await list_voices()):
for idx, voice in enumerate(await list_voices(proxy=proxy)):
if idx != 0:
print()
@@ -112,13 +112,13 @@ async def _main():
"-O",
"--overlapping",
help="overlapping subtitles in seconds",
default=5,
default=1,
type=float,
)
parser.add_argument(
"-b",
"--boundary-type",
help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary.",
default=0,
type=int,
)
@@ -136,7 +136,7 @@ async def _main():
args = parser.parse_args()
if args.list_voices:
await _list_voices()
await _list_voices(args.proxy)
sys.exit(0)
if args.text is not None or args.file is not None: