Update edge-tts to version 4.0.11

* Add pylint check for lint.sh
* Change overlapping default from 5second to 1second for SubMaker and util.py
* Default to WordBoundary for edge-playback (from SentenceBoundary)
* Drop SentenceBoundary support (never works properly and too many hacks)
  - No longer actually supported by Azure's official SDK for a few years
    already
* Stop attempting to correct broken offsets sent back to us by Azure
  - Fixes never work properly because AI voice sometimes takes odd pauses
    at the start and sometimes doesn't do so. It's never predictable and
    cannot be fixed on the library's end.
  - Solution is for Microsoft to fix the integer overflow bug they are facing
    in the {Word,Sentence}Boundary offsets. It doesn't affect us until we reach
    30min long TTS anyway.
* Have edge-tts --list-voices use the configured HTTP proxy
* More misc changes and fixes
This commit is contained in:
rany2
2022-05-29 18:08:16 +03:00
parent 9a20f1ca90
commit 797d04f182
7 changed files with 53 additions and 67 deletions

View File

@@ -55,8 +55,8 @@ def remove_incompatible_characters(string):
string = list(string)
for idx in range(len(string)): # pylint: disable=consider-using-enumerate
code = ord(string[idx])
for idx, char in enumerate(string):
code = ord(char)
if (0 <= code <= 8) or (11 <= code <= 12) or (14 <= code <= 31):
string[idx] = " "
@@ -193,7 +193,7 @@ def ssml_headers_plus_data(request_id, timestamp, ssml):
)
class Communicate: # pylint: disable=too-few-public-methods
class Communicate:
"""
Class for communicating with the service.
"""
@@ -215,7 +215,7 @@ class Communicate: # pylint: disable=too-few-public-methods
volume="+0%",
customspeak=False,
proxy=None,
): # pylint: disable=too-many-arguments, too-many-locals
):
"""
Runs the Communicate class.
@@ -234,14 +234,14 @@ class Communicate: # pylint: disable=too-few-public-methods
"""
word_boundary = False
sentence_boundary = False
if boundary_type > 0:
word_boundary = True
if boundary_type > 1:
sentence_boundary = True
raise ValueError(
"Invalid boundary type. SentenceBoundary is no longer supported."
)
sentence_boundary = str(sentence_boundary).lower()
word_boundary = str(word_boundary).lower()
if not customspeak:
@@ -262,12 +262,8 @@ class Communicate: # pylint: disable=too-few-public-methods
if isinstance(messages, str):
messages = [messages]
# Variables for the loop
download = False
current_subtitle = ""
first_offset = None
last_offset = None
async with aiohttp.ClientSession(trust_env=True) as session:
async with session.ws_connect(
f"{WSS_URL}&ConnectionId={connect_id()}",
@@ -304,7 +300,7 @@ class Communicate: # pylint: disable=too-few-public-methods
"Content-Type:application/json; charset=utf-8\r\n"
"Path:speech.config\r\n\r\n"
'{"context":{"synthesis":{"audio":{"metadataoptions":{'
f'"sentenceBoundaryEnabled":{sentence_boundary},'
f'"sentenceBoundaryEnabled":false,'
f'"wordBoundaryEnabled":{word_boundary}}},"outputFormat":"{codec}"'
"}}}}\r\n"
)
@@ -326,12 +322,6 @@ class Communicate: # pylint: disable=too-few-public-methods
# Begin listening for the response.
async for received in websocket:
if received.type in (
aiohttp.WSMsgType.CLOSED,
aiohttp.WSMsgType.ERROR,
):
break
if received.type == aiohttp.WSMsgType.TEXT:
parameters, data = get_headers_and_data(received.data)
if (
@@ -358,12 +348,15 @@ class Communicate: # pylint: disable=too-few-public-methods
metadata_duration = metadata["Metadata"][0]["Data"][
"Duration"
]
except KeyError:
metadata_duration = 0
except KeyError as exception:
raise ValueError(
"The metadata doesn't contain a Duration field. "
+ "This usually happens when SentenceBoundary metadata type is sent."
) from exception
metadata_text = metadata["Metadata"][0]["Data"]["text"][
"Text"
]
if boundary_type == 1:
if metadata_type == "WordBoundary":
yield (
[
metadata_offset,
@@ -372,31 +365,32 @@ class Communicate: # pylint: disable=too-few-public-methods
metadata_text,
None,
)
elif metadata_type == "SentenceBoundary":
raise NotImplementedError(
"SentenceBoundary is not supported due to being broken."
)
else:
if metadata_type == "WordBoundary":
if current_subtitle:
current_subtitle += " "
current_subtitle += metadata_text
if first_offset is None:
first_offset = metadata_offset
last_offset = [
metadata_offset,
metadata_duration,
]
elif metadata_type == "SentenceBoundary":
if current_subtitle:
yield (
[
first_offset,
sum(last_offset) - first_offset,
],
current_subtitle,
None,
)
current_subtitle = ""
first_offset = None
last_offset = None
raise NotImplementedError(
f"Unknown metadata type: {metadata_type}"
)
elif (
"Path" in parameters
and parameters["Path"] == "response"
):
# TODO: implement this:
"""
X-RequestId:xxxxxxxxxxxxxxxxxxxxxxxxx
Content-Type:application/json; charset=utf-8
Path:response
{"context":{"serviceTag":"yyyyyyyyyyyyyyyyyyy"},"audio":{"type":"inline","streamId":"zzzzzzzzzzzzzzzzz"}}
"""
pass
else:
raise ValueError(
"The response from the service is not recognized.\n"
+ received.data
)
elif received.type == aiohttp.WSMsgType.BINARY:
if download:
yield (
@@ -406,10 +400,8 @@ class Communicate: # pylint: disable=too-few-public-methods
received.data.split(b"Path:audio\r\n")[1:]
),
)
if current_subtitle:
yield (
[first_offset, sum(last_offset) - first_offset],
current_subtitle,
None,
)
else:
raise ValueError(
"The service sent a binary message, but we are not expecting one."
)
await websocket.close()