Drop words_in_cue code for SubMaker and switch to SRT (#323)

WebVTT isn't a very common format in the first place
and attempting to make WordBoundary play nice with
input text is very hard.

Instead we now just display the word that the TTS
is saying at a given time. In the future, we could
try to enable SentenceBoundary but there is a risk
that it will be banned by Microsoft as it is not used
by Microsoft Edge itself.

Closes: https://github.com/rany2/edge-tts/issues/118
Closes: https://github.com/rany2/edge-tts/issues/171
Closes: https://github.com/rany2/edge-tts/issues/229
Closes: https://github.com/rany2/edge-tts/issues/234

Signed-off-by: rany <rany2@riseup.net>
This commit is contained in:
Rany
2024-11-22 20:58:47 +02:00
committed by GitHub
parent 5a2674cd03
commit 4f5d79ed57
6 changed files with 38 additions and 123 deletions

View File

@@ -18,7 +18,7 @@ If you only want to use the `edge-tts` and `edge-playback` commands, it would be
 If you want to use the `edge-tts` command, you can simply run it with the following command:
-    $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.vtt
+    $ edge-tts --text "Hello, world!" --write-media hello.mp3 --write-subtitles hello.srt
 If you wish to play it back immediately with subtitles, you could use the `edge-playback` command:
@@ -48,7 +48,7 @@ You can change the voice used by the text-to-speech service by using the `--voic
 ar-EG-SalmaNeural Female General Friendly, Positive
 ...
-    $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.vtt
+    $ edge-tts --voice ar-EG-SalmaNeural --text "مرحبا كيف حالك؟" --write-media hello_in_arabic.mp3 --write-subtitles hello_in_arabic.srt
 ### Custom SSML
@@ -58,9 +58,9 @@ Support for custom SSML was removed because Microsoft prevents the use of any SS
 You can change the rate, volume and pitch of the generated speech by using the `--rate`, `--volume` and `--pitch` options. When using a negative value, you will need to use `--[option]=-50%` instead of `--[option] -50%` to avoid the option being interpreted as a command line option.
-    $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.vtt
+    $ edge-tts --rate=-50% --text "Hello, world!" --write-media hello_with_rate_lowered.mp3 --write-subtitles hello_with_rate_lowered.srt
-    $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.vtt
+    $ edge-tts --volume=-50% --text "Hello, world!" --write-media hello_with_volume_lowered.mp3 --write-subtitles hello_with_volume_lowered.srt
-    $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.vtt
+    $ edge-tts --pitch=-50Hz --text "Hello, world!" --write-media hello_with_pitch_lowered.mp3 --write-subtitles hello_with_pitch_lowered.srt
 ## Python module

View File

@@ -14,7 +14,7 @@ import edge_tts
 TEXT = "Hello World!"
 VOICE = "en-GB-SoniaNeural"
 OUTPUT_FILE = "test.mp3"
-WEBVTT_FILE = "test.vtt"
+SRT_FILE = "test.srt"

 async def amain() -> None:
@@ -26,10 +26,10 @@ async def amain() -> None:
             if chunk["type"] == "audio":
                 file.write(chunk["data"])
             elif chunk["type"] == "WordBoundary":
-                submaker.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
+                submaker.add_cue((chunk["offset"], chunk["duration"]), chunk["text"])

-    with open(WEBVTT_FILE, "w", encoding="utf-8") as file:
-        file.write(submaker.generate_subs())
+    with open(SRT_FILE, "w", encoding="utf-8") as file:
+        file.write(submaker.get_srt())

 if __name__ == "__main__":

View File

@@ -6,6 +6,7 @@ setup(
     install_requires=[
         "aiohttp>=3.8.0",
         "certifi>=2023.11.17",
+        "srt>=3.4.1",
         "tabulate>=0.4.4",
         "typing-extensions>=4.1.0",
     ],

View File

@@ -25,7 +25,7 @@ def _main() -> None:
     keep = os.environ.get("EDGE_PLAYBACK_KEEP_TEMP") is not None
     mp3_fname = os.environ.get("EDGE_PLAYBACK_MP3_FILE")
-    vtt_fname = os.environ.get("EDGE_PLAYBACK_VTT_FILE")
+    srt_fname = os.environ.get("EDGE_PLAYBACK_SRT_FILE")
     media, subtitle = None, None
     try:
         if not mp3_fname:
@@ -33,18 +33,18 @@ def _main() -> None:
             media.close()
             mp3_fname = media.name
-        if not vtt_fname:
-            subtitle = tempfile.NamedTemporaryFile(suffix=".vtt", delete=False)
+        if not srt_fname:
+            subtitle = tempfile.NamedTemporaryFile(suffix=".srt", delete=False)
             subtitle.close()
-            vtt_fname = subtitle.name
+            srt_fname = subtitle.name

         print(f"Media file: {mp3_fname}")
-        print(f"Subtitle file: {vtt_fname}\n")
+        print(f"Subtitle file: {srt_fname}\n")
         with subprocess.Popen(
             [
                 "edge-tts",
                 f"--write-media={mp3_fname}",
-                f"--write-subtitles={vtt_fname}",
+                f"--write-subtitles={srt_fname}",
             ]
             + sys.argv[1:]
         ) as process:
@@ -53,19 +53,19 @@ def _main() -> None:
         with subprocess.Popen(
             [
                 "mpv",
-                f"--sub-file={vtt_fname}",
+                f"--sub-file={srt_fname}",
                 mp3_fname,
             ]
         ) as process:
             process.communicate()
     finally:
         if keep:
-            print(f"\nKeeping temporary files: {mp3_fname} and {vtt_fname}")
+            print(f"\nKeeping temporary files: {mp3_fname} and {srt_fname}")
         else:
             if mp3_fname is not None and os.path.exists(mp3_fname):
                 os.unlink(mp3_fname)
-            if vtt_fname is not None and os.path.exists(vtt_fname):
-                os.unlink(vtt_fname)
+            if srt_fname is not None and os.path.exists(srt_fname):
+                os.unlink(srt_fname)

 if __name__ == "__main__":

View File

@@ -1,33 +1,8 @@
 """SubMaker module is used to generate subtitles from WordBoundary events."""
-import math
 from typing import List, Tuple
-from xml.sax.saxutils import escape, unescape
+import srt  # type: ignore

-def formatter(start_time: float, end_time: float, subdata: str) -> str:
-    """
-    formatter returns the timecode and the text of the subtitle.
-    """
-    return (
-        f"{mktimestamp(start_time)} --> {mktimestamp(end_time)}\r\n"
-        f"{escape(subdata)}\r\n\r\n"
-    )

-def mktimestamp(time_unit: float) -> str:
-    """
-    mktimestamp returns the timecode of the subtitle.
-    The timecode is in the format of 00:00:00.000.
-    Returns:
-        str: The timecode of the subtitle.
-    """
-    hour = math.floor(time_unit / 10**7 / 3600)
-    minute = math.floor((time_unit / 10**7 / 60) % 60)
-    seconds = (time_unit / 10**7) % 60
-    return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"

 class SubMaker:
@@ -36,19 +11,11 @@ class SubMaker:
     """

     def __init__(self) -> None:
-        """
-        SubMaker constructor initializes the list of subtitles and the list of offsets.
-        Returns:
-            None
-        """
-        self.offset: List[Tuple[float, float]] = []
-        self.subs: List[str] = []
+        self.cues: List[srt.Subtitle] = []  # type: ignore

-    def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
+    def add_cue(self, timestamp: Tuple[float, float], text: str) -> None:
         """
-        create_sub creates a subtitle from the given timestamp and text,
-        and appends it to the list of subtitles.
+        Add a subtitle part to the SubMaker object.

         Args:
             timestamp (tuple): The offset and duration of the subtitle.
@@ -57,67 +24,20 @@ class SubMaker:
         Returns:
             None
         """
-        self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
-        self.subs.append(text)
+        self.cues.append(
+            srt.Subtitle(
+                index=len(self.cues) + 1,
+                start=srt.timedelta(microseconds=timestamp[0] / 10),
+                end=srt.timedelta(microseconds=sum(timestamp) / 10),
+                content=text,
+            )
+        )

-    def generate_subs(self, words_in_cue: int = 10) -> str:
+    def get_srt(self) -> str:
         """
-        generate_subs generates the complete subtitle file.
+        Get the SRT formatted subtitles from the SubMaker object.

-        Args:
-            words_in_cue (int): defines the number of words in a given cue
-
         Returns:
-            str: The complete subtitle file.
+            str: The SRT formatted subtitles.
         """
-        if len(self.subs) != len(self.offset):
-            raise ValueError("subs and offset are not of the same length")
-        if words_in_cue <= 0:
-            raise ValueError("words_in_cue must be greater than 0")
-        data = "WEBVTT\r\n\r\n"
-        sub_state_count = 0
-        sub_state_start = -1.0
-        sub_state_subs = ""
-        for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
-            start_time, end_time = offset
-            subs = unescape(subs)
-            # wordboundary is guaranteed not to contain whitespace
-            if len(sub_state_subs) > 0:
-                sub_state_subs += " "
-            sub_state_subs += subs
-            if sub_state_start == -1.0:
-                sub_state_start = start_time
-            sub_state_count += 1
-            if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
-                subs = sub_state_subs
-                split_subs: List[str] = [
-                    subs[i : i + 79] for i in range(0, len(subs), 79)
-                ]
-                for i in range(len(split_subs) - 1):
-                    sub = split_subs[i]
-                    split_at_word = True
-                    if sub[-1] == " ":
-                        split_subs[i] = sub[:-1]
-                        split_at_word = False
-                    if sub[0] == " ":
-                        split_subs[i] = sub[1:]
-                        split_at_word = False
-                    if split_at_word:
-                        split_subs[i] += "-"
-                data += formatter(
-                    start_time=sub_state_start,
-                    end_time=end_time,
-                    subdata="\r\n".join(split_subs),
-                )
-                sub_state_count = 0
-                sub_state_start = -1
-                sub_state_subs = ""
-        return data
+        return srt.compose(self.cues)  # type: ignore

View File

@@ -61,7 +61,7 @@ async def _run_tts(args: Any) -> None:
             if chunk["type"] == "audio":
                 audio_file.write(chunk["data"])
             elif chunk["type"] == "WordBoundary":
-                subs.create_sub((chunk["offset"], chunk["duration"]), chunk["text"])
+                subs.add_cue((chunk["offset"], chunk["duration"]), chunk["text"])

     sub_file: Union[TextIOWrapper, TextIO] = (
         open(args.write_subtitles, "w", encoding="utf-8")
@@ -69,7 +69,7 @@ async def _run_tts(args: Any) -> None:
         else sys.stderr
     )
     with sub_file:
-        sub_file.write(subs.generate_subs(args.words_in_cue))
+        sub_file.write(subs.get_srt())

 async def amain() -> None:
@@ -93,12 +93,6 @@ async def amain() -> None:
     parser.add_argument("--rate", help="set TTS rate. Default +0%%.", default="+0%")
     parser.add_argument("--volume", help="set TTS volume. Default +0%%.", default="+0%")
     parser.add_argument("--pitch", help="set TTS pitch. Default +0Hz.", default="+0Hz")
-    parser.add_argument(
-        "--words-in-cue",
-        help="number of words in a subtitle cue. Default: 10.",
-        default=10,
-        type=float,
-    )
     parser.add_argument(
         "--write-media", help="send media output to file instead of stdout"
     )