Update edge-tts to version 4.0.11

* Add pylint check to lint.sh
* Change the default overlapping from 5 seconds to 1 second for SubMaker and util.py
* Default to WordBoundary for edge-playback (previously SentenceBoundary)
* Drop SentenceBoundary support (it never worked properly and needed too many hacks)
  - It has not actually been supported by Azure's official SDK for a few years already
* Stop attempting to correct broken offsets sent back to us by Azure
  - The fixes never work properly because the AI voice sometimes takes odd pauses at the start and sometimes doesn't. It is never predictable and cannot be fixed on the library's end.
  - The solution is for Microsoft to fix the integer overflow bug they are facing in the {Word,Sentence}Boundary offsets. It doesn't affect us until we reach 30-minute-long TTS anyway.
* Have edge-tts --list-voices use the configured HTTP proxy
* More misc changes and fixes
2022-05-29 18:08:16 +03:00
parent 9a20f1ca90
commit 797d04f182
7 changed files with 53 additions and 67 deletions
--- a/lint.sh
+++ b/lint.sh
@@ -1,2 +1,3 @@
 find src examples -name '*.py' | xargs black
 find src examples -name '*.py' | xargs isort
 find src examples -name '*.py' | xargs pylint
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = edge-tts
-version = 4.0.10
+version = 4.0.11
 author = rany
 author_email = ranygh@riseup.net
 description = Microsoft Edge's TTS
--- a/src/edge_playback/init.py
+++ b/src/edge_playback/init.py
@@ -28,7 +28,7 @@ def main():
            with subprocess.Popen(
                [
                    "edge-tts",
-                    "--boundary-type=2",
+                    "--boundary-type=1",
                    f"--write-media={media.name}",
                    f"--write-subtitles={subtitle.name}",
                ]
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -55,8 +55,8 @@ def remove_incompatible_characters(string):
    string = list(string)
-    for idx in range(len(string)):  # pylint: disable=consider-using-enumerate
+    for idx, char in enumerate(string):
-        code = ord(string[idx])
+        code = ord(char)
        if (0 <= code <= 8) or (11 <= code <= 12) or (14 <= code <= 31):
            string[idx] = " "
@@ -193,7 +193,7 @@ def ssml_headers_plus_data(request_id, timestamp, ssml):
    )
-class Communicate:  # pylint: disable=too-few-public-methods
+class Communicate:
    """
    Class for communicating with the service.
    """
@@ -215,7 +215,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
        volume="+0%",
        customspeak=False,
        proxy=None,
-    ):  # pylint: disable=too-many-arguments, too-many-locals
+    ):
        """
        Runs the Communicate class.
@@ -234,14 +234,14 @@ class Communicate:  # pylint: disable=too-few-public-methods
        """
        word_boundary = False
        sentence_boundary = False
        if boundary_type > 0:
            word_boundary = True
        if boundary_type > 1:
-            sentence_boundary = True
+            raise ValueError(
                "Invalid boundary type. SentenceBoundary is no longer supported."
            )
        sentence_boundary = str(sentence_boundary).lower()
        word_boundary = str(word_boundary).lower()
        if not customspeak:
@@ -262,12 +262,8 @@ class Communicate:  # pylint: disable=too-few-public-methods
            if isinstance(messages, str):
                messages = [messages]
        # Variables for the loop
        download = False
        current_subtitle = ""
        first_offset = None
        last_offset = None
        async with aiohttp.ClientSession(trust_env=True) as session:
            async with session.ws_connect(
                f"{WSS_URL}&ConnectionId={connect_id()}",
@@ -304,7 +300,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
                        "Content-Type:application/json; charset=utf-8\r\n"
                        "Path:speech.config\r\n\r\n"
                        '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                        f'"sentenceBoundaryEnabled":{sentence_boundary},'
+                        f'"sentenceBoundaryEnabled":false,'
                        f'"wordBoundaryEnabled":{word_boundary}}},"outputFormat":"{codec}"'
                        "}}}}\r\n"
                    )
@@ -326,12 +322,6 @@ class Communicate:  # pylint: disable=too-few-public-methods
                    # Begin listening for the response.
                    async for received in websocket:
                        if received.type in (
                            aiohttp.WSMsgType.CLOSED,
                            aiohttp.WSMsgType.ERROR,
                        ):
                            break
                        if received.type == aiohttp.WSMsgType.TEXT:
                            parameters, data = get_headers_and_data(received.data)
                            if (
@@ -358,12 +348,15 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                    metadata_duration = metadata["Metadata"][0]["Data"][
                                        "Duration"
                                    ]
-                                except KeyError:
+                                except KeyError as exception:
-                                    metadata_duration = 0
+                                    raise ValueError(
                                        "The metadata doesn't contain a Duration field. "
                                        + "This usually happens when SentenceBoundary metadata type is sent."
                                    ) from exception
                                metadata_text = metadata["Metadata"][0]["Data"]["text"][
                                    "Text"
                                ]
-                                if boundary_type == 1:
+                                if metadata_type == "WordBoundary":
                                    yield (
                                        [
                                            metadata_offset,
@@ -372,31 +365,32 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                        metadata_text,
                                        None,
                                    )
                                elif metadata_type == "SentenceBoundary":
                                    raise NotImplementedError(
                                        "SentenceBoundary is not supported due to being broken."
                                    )
                                else:
-                                    if metadata_type == "WordBoundary":
+                                    raise NotImplementedError(
-                                        if current_subtitle:
+                                        f"Unknown metadata type: {metadata_type}"
-                                            current_subtitle += " "
+                                    )
-                                        current_subtitle += metadata_text
+                            elif (
-                                        if first_offset is None:
+                                "Path" in parameters
-                                            first_offset = metadata_offset
+                                and parameters["Path"] == "response"
-                                        last_offset = [
+                            ):
-                                            metadata_offset,
+                                # TODO: implement this:
-                                            metadata_duration,
+                                """
-                                        ]
+                                X-RequestId:xxxxxxxxxxxxxxxxxxxxxxxxx
-                                    elif metadata_type == "SentenceBoundary":
+                                Content-Type:application/json; charset=utf-8
-                                        if current_subtitle:
+                                Path:response
                                            yield (
                                                [
                                                    first_offset,
                                                    sum(last_offset) - first_offset,
                                                ],
                                                current_subtitle,
                                                None,
                                            )
                                        current_subtitle = ""
                                        first_offset = None
                                        last_offset = None
                                {"context":{"serviceTag":"yyyyyyyyyyyyyyyyyyy"},"audio":{"type":"inline","streamId":"zzzzzzzzzzzzzzzzz"}}
                                """
                                pass
                            else:
                                raise ValueError(
                                    "The response from the service is not recognized.\n"
                                    + received.data
                                )
                        elif received.type == aiohttp.WSMsgType.BINARY:
                            if download:
                                yield (
@@ -406,10 +400,8 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                        received.data.split(b"Path:audio\r\n")[1:]
                                    ),
                                )
-                if current_subtitle:
+                            else:
-                    yield (
+                                raise ValueError(
-                        [first_offset, sum(last_offset) - first_offset],
+                                    "The service sent a binary message, but we are not expecting one."
-                        current_subtitle,
+                                )
                        None,
                    )
                await websocket.close()
--- a/src/edge_tts/list_voices.py
+++ b/src/edge_tts/list_voices.py
@@ -9,7 +9,7 @@ import aiohttp
 from .constants import VOICE_LIST
-async def list_voices():
+async def list_voices(proxy=None):
    """
    List all available voices and their attributes.
@@ -36,6 +36,7 @@ async def list_voices():
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9",
            },
            proxy=proxy,
        ) as url:
            data = json.loads(await url.text())
    return data
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -39,7 +39,7 @@ class SubMaker:
    SubMaker class
    """
-    def __init__(self, overlapping=5):
+    def __init__(self, overlapping=1):
        """
        SubMaker constructor.
@@ -48,7 +48,6 @@ class SubMaker:
                               subtitles should overlap.
        """
        self.subs_and_offset = []
        self.broken_offset = 0
        self.overlapping = overlapping * (10**7)
    def create_sub(self, timestamp, text):
@@ -64,13 +63,6 @@ class SubMaker:
            None
        """
        timestamp[1] += timestamp[0]
        if len(self.subs_and_offset) >= 2:
            if self.subs_and_offset[-2][1] >= timestamp[0] + self.broken_offset:
                self.broken_offset = self.subs_and_offset[-2][1]
            timestamp[0] += self.broken_offset
            timestamp[1] += self.broken_offset
        self.subs_and_offset.append(timestamp)
        self.subs_and_offset.append(text)
--- a/src/edge_tts/util.py
+++ b/src/edge_tts/util.py
@@ -10,11 +10,11 @@ import sys
 from edge_tts import Communicate, SubMaker, list_voices
-async def _list_voices():
+async def _list_voices(proxy):
    """
    List available voices.
    """
-    for idx, voice in enumerate(await list_voices()):
+    for idx, voice in enumerate(await list_voices(proxy=proxy)):
        if idx != 0:
            print()
@@ -112,13 +112,13 @@ async def _main():
        "-O",
        "--overlapping",
        help="overlapping subtitles in seconds",
-        default=5,
+        default=1,
        type=float,
    )
    parser.add_argument(
        "-b",
        "--boundary-type",
-        help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
+        help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary.",
        default=0,
        type=int,
    )
@@ -136,7 +136,7 @@ async def _main():
    args = parser.parse_args()
    if args.list_voices:
-        await _list_voices()
+        await _list_voices(args.proxy)
        sys.exit(0)
    if args.text is not None or args.file is not None: