Update edge-tts to version 4.0.11

* Add pylint check to lint.sh
* Change the default overlapping from 5 seconds to 1 second for SubMaker and util.py
* Default to WordBoundary for edge-playback (previously SentenceBoundary)
* Drop SentenceBoundary support (it never worked properly and required too many hacks)
  - It has not been supported by Azure's official SDK for a few years already
* Stop attempting to correct broken offsets sent back to us by Azure
  - The fixes never worked properly because the AI voice sometimes takes odd pauses at the start and sometimes doesn't. This is unpredictable and cannot be fixed on the library's end.
  - The solution is for Microsoft to fix the integer overflow bug affecting the {Word,Sentence}Boundary offsets. It doesn't affect us until the TTS output is 30 minutes long anyway.
* Have edge-tts --list-voices use the configured HTTP proxy
* More miscellaneous changes and fixes
2022-05-29 18:08:16 +03:00
parent 9a20f1ca90
commit 797d04f182
7 changed files with 53 additions and 67 deletions
--- a/lint.sh
+++ b/lint.sh
@@ -1,2 +1,3 @@
 find src examples -name '*.py' | xargs black
 find src examples -name '*.py' | xargs isort
+find src examples -name '*.py' | xargs pylint
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,6 +1,6 @@
 [metadata]
 name = edge-tts
-version = 4.0.10
+version = 4.0.11
 author = rany
 author_email = ranygh@riseup.net
 description = Microsoft Edge's TTS
--- a/src/edge_playback/init.py
+++ b/src/edge_playback/init.py
@@ -28,7 +28,7 @@ def main():
            with subprocess.Popen(
                [
                    "edge-tts",
-                    "--boundary-type=2",
+                    "--boundary-type=1",
                    f"--write-media={media.name}",
                    f"--write-subtitles={subtitle.name}",
                ]
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -55,8 +55,8 @@ def remove_incompatible_characters(string):

    string = list(string)

-    for idx in range(len(string)):  # pylint: disable=consider-using-enumerate
-        code = ord(string[idx])
+    for idx, char in enumerate(string):
+        code = ord(char)
        if (0 <= code <= 8) or (11 <= code <= 12) or (14 <= code <= 31):
            string[idx] = " "

@@ -193,7 +193,7 @@ def ssml_headers_plus_data(request_id, timestamp, ssml):
    )


-class Communicate:  # pylint: disable=too-few-public-methods
+class Communicate:
    """
    Class for communicating with the service.
    """
@@ -215,7 +215,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
        volume="+0%",
        customspeak=False,
        proxy=None,
-    ):  # pylint: disable=too-many-arguments, too-many-locals
+    ):
        """
        Runs the Communicate class.

@@ -234,14 +234,14 @@ class Communicate:  # pylint: disable=too-few-public-methods
        """

        word_boundary = False
-        sentence_boundary = False

        if boundary_type > 0:
            word_boundary = True
        if boundary_type > 1:
-            sentence_boundary = True
+            raise ValueError(
+                "Invalid boundary type. SentenceBoundary is no longer supported."
+            )

-        sentence_boundary = str(sentence_boundary).lower()
        word_boundary = str(word_boundary).lower()

        if not customspeak:
@@ -262,12 +262,8 @@ class Communicate:  # pylint: disable=too-few-public-methods
            if isinstance(messages, str):
                messages = [messages]

-
        # Variables for the loop
        download = False
-        current_subtitle = ""
-        first_offset = None
-        last_offset = None
        async with aiohttp.ClientSession(trust_env=True) as session:
            async with session.ws_connect(
                f"{WSS_URL}&ConnectionId={connect_id()}",
@@ -304,7 +300,7 @@ class Communicate:  # pylint: disable=too-few-public-methods
                        "Content-Type:application/json; charset=utf-8\r\n"
                        "Path:speech.config\r\n\r\n"
                        '{"context":{"synthesis":{"audio":{"metadataoptions":{'
-                        f'"sentenceBoundaryEnabled":{sentence_boundary},'
+                        f'"sentenceBoundaryEnabled":false,'
                        f'"wordBoundaryEnabled":{word_boundary}}},"outputFormat":"{codec}"'
                        "}}}}\r\n"
                    )
@@ -326,12 +322,6 @@ class Communicate:  # pylint: disable=too-few-public-methods

                    # Begin listening for the response.
                    async for received in websocket:
-                        if received.type in (
-                            aiohttp.WSMsgType.CLOSED,
-                            aiohttp.WSMsgType.ERROR,
-                        ):
-                            break
-
                        if received.type == aiohttp.WSMsgType.TEXT:
                            parameters, data = get_headers_and_data(received.data)
                            if (
@@ -358,12 +348,15 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                    metadata_duration = metadata["Metadata"][0]["Data"][
                                        "Duration"
                                    ]
-                                except KeyError:
-                                    metadata_duration = 0
+                                except KeyError as exception:
+                                    raise ValueError(
+                                        "The metadata doesn't contain a Duration field. "
+                                        + "This usually happens when SentenceBoundary metadata type is sent."
+                                    ) from exception
                                metadata_text = metadata["Metadata"][0]["Data"]["text"][
                                    "Text"
                                ]
-                                if boundary_type == 1:
+                                if metadata_type == "WordBoundary":
                                    yield (
                                        [
                                            metadata_offset,
@@ -372,31 +365,32 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                        metadata_text,
                                        None,
                                    )
+                                elif metadata_type == "SentenceBoundary":
+                                    raise NotImplementedError(
+                                        "SentenceBoundary is not supported due to being broken."
+                                    )
                                else:
-                                    if metadata_type == "WordBoundary":
-                                        if current_subtitle:
-                                            current_subtitle += " "
-                                        current_subtitle += metadata_text
-                                        if first_offset is None:
-                                            first_offset = metadata_offset
-                                        last_offset = [
-                                            metadata_offset,
-                                            metadata_duration,
-                                        ]
-                                    elif metadata_type == "SentenceBoundary":
-                                        if current_subtitle:
-                                            yield (
-                                                [
-                                                    first_offset,
-                                                    sum(last_offset) - first_offset,
-                                                ],
-                                                current_subtitle,
-                                                None,
-                                            )
-                                        current_subtitle = ""
-                                        first_offset = None
-                                        last_offset = None
+                                    raise NotImplementedError(
+                                        f"Unknown metadata type: {metadata_type}"
+                                    )
+                            elif (
+                                "Path" in parameters
+                                and parameters["Path"] == "response"
+                            ):
+                                # TODO: implement this:
+                                """
+                                X-RequestId:xxxxxxxxxxxxxxxxxxxxxxxxx
+                                Content-Type:application/json; charset=utf-8
+                                Path:response

+                                {"context":{"serviceTag":"yyyyyyyyyyyyyyyyyyy"},"audio":{"type":"inline","streamId":"zzzzzzzzzzzzzzzzz"}}
+                                """
+                                pass
+                            else:
+                                raise ValueError(
+                                    "The response from the service is not recognized.\n"
+                                    + received.data
+                                )
                        elif received.type == aiohttp.WSMsgType.BINARY:
                            if download:
                                yield (
@@ -406,10 +400,8 @@ class Communicate:  # pylint: disable=too-few-public-methods
                                        received.data.split(b"Path:audio\r\n")[1:]
                                    ),
                                )
-                if current_subtitle:
-                    yield (
-                        [first_offset, sum(last_offset) - first_offset],
-                        current_subtitle,
-                        None,
-                    )
+                            else:
+                                raise ValueError(
+                                    "The service sent a binary message, but we are not expecting one."
+                                )
                await websocket.close()
--- a/src/edge_tts/list_voices.py
+++ b/src/edge_tts/list_voices.py
@@ -9,7 +9,7 @@ import aiohttp
 from .constants import VOICE_LIST


-async def list_voices():
+async def list_voices(proxy=None):
    """
    List all available voices and their attributes.

@@ -36,6 +36,7 @@ async def list_voices():
                "Accept-Encoding": "gzip, deflate, br",
                "Accept-Language": "en-US,en;q=0.9",
            },
+            proxy=proxy,
        ) as url:
            data = json.loads(await url.text())
    return data
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -39,7 +39,7 @@ class SubMaker:
    SubMaker class
    """

-    def __init__(self, overlapping=5):
+    def __init__(self, overlapping=1):
        """
        SubMaker constructor.

@@ -48,7 +48,6 @@ class SubMaker:
                               subtitles should overlap.
        """
        self.subs_and_offset = []
-        self.broken_offset = 0
        self.overlapping = overlapping * (10**7)

    def create_sub(self, timestamp, text):
@@ -64,13 +63,6 @@ class SubMaker:
            None
        """
        timestamp[1] += timestamp[0]
-
-        if len(self.subs_and_offset) >= 2:
-            if self.subs_and_offset[-2][1] >= timestamp[0] + self.broken_offset:
-                self.broken_offset = self.subs_and_offset[-2][1]
-            timestamp[0] += self.broken_offset
-            timestamp[1] += self.broken_offset
-
        self.subs_and_offset.append(timestamp)
        self.subs_and_offset.append(text)

--- a/src/edge_tts/util.py
+++ b/src/edge_tts/util.py
@@ -10,11 +10,11 @@ import sys
 from edge_tts import Communicate, SubMaker, list_voices


-async def _list_voices():
+async def _list_voices(proxy):
    """
    List available voices.
    """
-    for idx, voice in enumerate(await list_voices()):
+    for idx, voice in enumerate(await list_voices(proxy=proxy)):
        if idx != 0:
            print()

@@ -112,13 +112,13 @@ async def _main():
        "-O",
        "--overlapping",
        help="overlapping subtitles in seconds",
-        default=5,
+        default=1,
        type=float,
    )
    parser.add_argument(
        "-b",
        "--boundary-type",
-        help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
+        help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary.",
        default=0,
        type=int,
    )
@@ -136,7 +136,7 @@ async def _main():
    args = parser.parse_args()

    if args.list_voices:
-        await _list_voices()
+        await _list_voices(args.proxy)
        sys.exit(0)

    if args.text is not None or args.file is not None: