Drop words_in_cue code for SubMaker and switch to SRT (#323)

WebVTT isn't a very common format in the first place,
and attempting to make WordBoundary play nicely with
the input text is very hard.

Instead, we now just display the word that the TTS
is saying at a given time. In the future, we could
try to enable SentenceBoundary, but there is a risk
that it will be banned by Microsoft, as it is not used
by Microsoft Edge itself.

Closes: https://github.com/rany2/edge-tts/issues/118
Closes: https://github.com/rany2/edge-tts/issues/171
Closes: https://github.com/rany2/edge-tts/issues/229
Closes: https://github.com/rany2/edge-tts/issues/234

Signed-off-by: rany <rany2@riseup.net>
This commit is contained in:
Rany
2024-11-22 20:58:47 +02:00
committed by GitHub
parent 5a2674cd03
commit 4f5d79ed57
6 changed files with 38 additions and 123 deletions

View File

@@ -1,33 +1,8 @@
"""SubMaker module is used to generate subtitles from WordBoundary events."""
import math
from typing import List, Tuple
from xml.sax.saxutils import escape, unescape
def formatter(start_time: float, end_time: float, subdata: str) -> str:
    """
    formatter builds a single cue: a timecode line followed by the cue text.

    Args:
        start_time (float): Start of the cue, passed to mktimestamp.
        end_time (float): End of the cue, passed to mktimestamp.
        subdata (str): Cue text; it is XML-escaped before being emitted.

    Returns:
        str: "<start> --> <end>", a CRLF, the escaped text, and a blank line.
    """
    timecode = " --> ".join((mktimestamp(start_time), mktimestamp(end_time)))
    return timecode + "\r\n" + escape(subdata) + "\r\n\r\n"
def mktimestamp(time_unit: float) -> str:
    """
    mktimestamp returns the timecode of the subtitle.

    The timecode is in the format of 00:00:00.000.

    Args:
        time_unit (float): Offset in units of 1e-7 seconds (the value is
            divided by 10**7 to obtain seconds).

    Returns:
        str: The timecode of the subtitle.
    """
    # Round to whole milliseconds *before* splitting into fields. Formatting
    # the fractional seconds on their own lets a value such as 59.9996 s
    # round up to "60.000" without carrying into the minute field.
    total_ms = round(time_unit / 10**4)
    hour, rest = divmod(total_ms, 3_600_000)
    minute, rest = divmod(rest, 60_000)
    second, milli = divmod(rest, 1000)
    return f"{hour:02d}:{minute:02d}:{second:02d}.{milli:03d}"
import srt # type: ignore
class SubMaker:
@@ -36,19 +11,11 @@ class SubMaker:
"""
def __init__(self) -> None:
"""
SubMaker constructor initializes the list of subtitles and the list of offsets.
self.cues: List[srt.Subtitle] = [] # type: ignore
Returns:
None
def add_cue(self, timestamp: Tuple[float, float], text: str) -> None:
"""
self.offset: List[Tuple[float, float]] = []
self.subs: List[str] = []
def create_sub(self, timestamp: Tuple[float, float], text: str) -> None:
"""
create_sub creates a subtitle from the given timestamp and text,
and appends it to the list of subtitles.
Add a subtitle part to the SubMaker object.
Args:
timestamp (tuple): The offset and duration of the subtitle.
@@ -57,67 +24,20 @@ class SubMaker:
Returns:
None
"""
self.offset.append((timestamp[0], timestamp[0] + timestamp[1]))
self.subs.append(text)
self.cues.append(
srt.Subtitle(
index=len(self.cues) + 1,
start=srt.timedelta(microseconds=timestamp[0] / 10),
end=srt.timedelta(microseconds=sum(timestamp) / 10),
content=text,
)
)
def generate_subs(self, words_in_cue: int = 10) -> str:
def get_srt(self) -> str:
"""
generate_subs generates the complete subtitle file.
Args:
words_in_cue (int): defines the number of words in a given cue
Get the SRT formatted subtitles from the SubMaker object.
Returns:
str: The complete subtitle file.
str: The SRT formatted subtitles.
"""
if len(self.subs) != len(self.offset):
raise ValueError("subs and offset are not of the same length")
if words_in_cue <= 0:
raise ValueError("words_in_cue must be greater than 0")
data = "WEBVTT\r\n\r\n"
sub_state_count = 0
sub_state_start = -1.0
sub_state_subs = ""
for idx, (offset, subs) in enumerate(zip(self.offset, self.subs)):
start_time, end_time = offset
subs = unescape(subs)
# wordboundary is guaranteed not to contain whitespace
if len(sub_state_subs) > 0:
sub_state_subs += " "
sub_state_subs += subs
if sub_state_start == -1.0:
sub_state_start = start_time
sub_state_count += 1
if sub_state_count == words_in_cue or idx == len(self.offset) - 1:
subs = sub_state_subs
split_subs: List[str] = [
subs[i : i + 79] for i in range(0, len(subs), 79)
]
for i in range(len(split_subs) - 1):
sub = split_subs[i]
split_at_word = True
if sub[-1] == " ":
split_subs[i] = sub[:-1]
split_at_word = False
if sub[0] == " ":
split_subs[i] = sub[1:]
split_at_word = False
if split_at_word:
split_subs[i] += "-"
data += formatter(
start_time=sub_state_start,
end_time=end_time,
subdata="\r\n".join(split_subs),
)
sub_state_count = 0
sub_state_start = -1
sub_state_subs = ""
return data
return srt.compose(self.cues) # type: ignore