Update edge_tts to v6.1.3

* Sort voices in util's --list-voices output * Disable overlapping subtitles by default * Try to fix subtitle timing for many-hour-long TTS generation * Match Microsoft Edge's connection behavior more closely * Bump edge_tts version
2023-01-25 18:29:40 +02:00
parent 85eef7d918
commit 58307ded26
6 changed files with 56 additions and 26 deletions
--- a/lint.sh
+++ b/lint.sh
@@ -1,2 +1,4 @@
+#!/bin/sh
+set -ux
 find src examples -name '*.py' | xargs pylint
 find src examples -name '*.py' | xargs mypy
--- a/4
+++ b/4
@@ -283,10 +283,10 @@ max-attributes=7
 max-bool-expr=5

 # Maximum number of branch for function / method body.
-max-branches=15
+max-branches=16

 # Maximum number of locals for function / method body.
-max-locals=15
+max-locals=18

 # Maximum number of parents for a class (see R0901).
 max-parents=7
--- a/src/edge_tts/communicate.py
+++ b/src/edge_tts/communicate.py
@@ -254,9 +254,15 @@ class Communicate:
        self.voice: str = voice
        match = re.match(r"^([a-z]{2})-([A-Z]{2})-(.+Neural)$", voice)
        if match is not None:
+            lang = match.group(1)
+            region = match.group(2)
+            name = match.group(3)
+            if name.find("-") != -1:
+                region = region + "-" + name[: name.find("-")]
+                name = name[name.find("-") + 1 :]
            self.voice = (
                "Microsoft Server Speech Text to Speech Voice"
-                + f" ({match.group(1)}-{match.group(2)}, {match.group(3)})"
+                + f" ({lang}-{region}, {name})"
            )

        if (
@@ -291,24 +297,29 @@ class Communicate:
            escape(remove_incompatible_characters(self.text)),
            calc_max_mesg_size(self.voice, self.rate, self.volume),
        )
+        final_utterance: Dict[int, int] = {}
+        prev_idx = -1
+        shift_time = -1

-        async with aiohttp.ClientSession(trust_env=True) as session, session.ws_connect(
-            f"{WSS_URL}&ConnectionId={connect_id()}",
-            compress=15,
-            autoclose=True,
-            autoping=True,
-            proxy=self.proxy,
-            headers={
-                "Pragma": "no-cache",
-                "Cache-Control": "no-cache",
-                "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
-                "Accept-Encoding": "gzip, deflate, br",
-                "Accept-Language": "en-US,en;q=0.9",
-                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
-                " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
-            },
-        ) as websocket:
-            for text in texts:
+        for idx, text in enumerate(texts):
+            async with aiohttp.ClientSession(
+                trust_env=True
+            ) as session, session.ws_connect(
+                f"{WSS_URL}&ConnectionId={connect_id()}",
+                compress=15,
+                autoclose=True,
+                autoping=True,
+                proxy=self.proxy,
+                headers={
+                    "Pragma": "no-cache",
+                    "Cache-Control": "no-cache",
+                    "Origin": "chrome-extension://jdiccldimpdaibmpdkjnbmckianbfold",
+                    "Accept-Encoding": "gzip, deflate, br",
+                    "Accept-Language": "en-US,en;q=0.9",
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                    " (KHTML, like Gecko) Chrome/91.0.4472.77 Safari/537.36 Edg/91.0.864.41",
+                },
+            ) as websocket:
                # download indicates whether we should be expecting audio data,
                # this is so that we avoid getting binary data from the websocket
                # and falsely thinking it's audio data.
@@ -362,10 +373,25 @@ class Communicate:
                        elif path == b"audio.metadata":
                            for meta_obj in json.loads(data)["Metadata"]:
                                meta_type = meta_obj["Type"]
+                                if idx != prev_idx:
+                                    shift_time = sum(
+                                        final_utterance[i] for i in range(idx)
+                                    )
+                                    prev_idx = idx
                                if meta_type == "WordBoundary":
+                                    final_utterance[idx] = (
+                                        meta_obj["Data"]["Offset"]
+                                        + meta_obj["Data"]["Duration"]
+                                        # Average padding added by the service
+                                        # Alternatively we could use ffmpeg to get value properly
+                                        # but I don't want to add an additional dependency
+                                        # if this is found to work well enough.
+                                        + 8_750_000
+                                    )
                                    yield {
                                        "type": meta_type,
-                                        "offset": meta_obj["Data"]["Offset"],
+                                        "offset": meta_obj["Data"]["Offset"]
+                                        + shift_time,
                                        "duration": meta_obj["Data"]["Duration"],
                                        "text": meta_obj["Data"]["text"]["Text"],
                                    }
--- a/src/edge_tts/submaker.py
+++ b/src/edge_tts/submaker.py
@@ -40,7 +40,7 @@ class SubMaker:
    SubMaker class
    """

-    def __init__(self, overlapping: int = 1) -> None:
+    def __init__(self, overlapping: int = 0) -> None:
        """
        SubMaker constructor.

--- a/src/edge_tts/util.py
+++ b/src/edge_tts/util.py
@@ -14,7 +14,9 @@ from edge_tts import Communicate, SubMaker, list_voices

 async def _print_voices(*, proxy: str) -> None:
    """Print all available voices."""
-    for idx, voice in enumerate(await list_voices(proxy=proxy)):
+    voices = await list_voices(proxy=proxy)
+    voices = sorted(voices, key=lambda voice: voice["ShortName"])  # type: ignore
+    for idx, voice in enumerate(voices):
        if idx != 0:
            print()

@@ -82,8 +84,8 @@ async def _async_main() -> None:
    parser.add_argument(
        "-O",
        "--overlapping",
-        help="overlapping subtitles in seconds",
-        default=1,
+        help="overlapping subtitles in seconds. Default: 0.",
+        default=0,
        type=float,
    )
    parser.add_argument(
--- a/src/edge_tts/version.py
+++ b/src/edge_tts/version.py
@@ -1,4 +1,4 @@
 """Edge TTS version information."""

-__version__ = "6.1.1"
+__version__ = "6.1.3"
 __version_info__ = tuple(int(num) for num in __version__.split("."))