fix sentence boundary
This commit is contained in:
2
lint.sh
Executable file
2
lint.sh
Executable file
@@ -0,0 +1,2 @@
|
|||||||
|
find src -name '*.py' | xargs black
|
||||||
|
find src -name '*.py' | xargs isort
|
||||||
@@ -18,16 +18,14 @@ def main():
|
|||||||
with tempfile.NamedTemporaryFile() as media:
|
with tempfile.NamedTemporaryFile() as media:
|
||||||
with tempfile.NamedTemporaryFile() as subtitle:
|
with tempfile.NamedTemporaryFile() as subtitle:
|
||||||
print()
|
print()
|
||||||
print(f"Media file {media.name}")
|
print(f"Media file: {media.name}")
|
||||||
print(f"Subtitle file {subtitle.name}\n")
|
print(f"Subtitle file: {subtitle.name}\n")
|
||||||
with subprocess.Popen(
|
with subprocess.Popen(
|
||||||
[
|
[
|
||||||
"edge-tts",
|
"edge-tts",
|
||||||
"-w",
|
"--boundary-type=2",
|
||||||
"--write-media",
|
f"--write-media={media.name}",
|
||||||
media.name,
|
f"--write-subtitles={subtitle.name}",
|
||||||
"--write-subtitles",
|
|
||||||
subtitle.name,
|
|
||||||
]
|
]
|
||||||
+ sys.argv[1:]
|
+ sys.argv[1:]
|
||||||
) as process:
|
) as process:
|
||||||
|
|||||||
@@ -207,8 +207,7 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
async def run(
|
async def run(
|
||||||
self,
|
self,
|
||||||
messages,
|
messages,
|
||||||
sentence_boundary=False,
|
boundary_type=0,
|
||||||
word_boundary=False,
|
|
||||||
codec="audio-24khz-48kbitrate-mono-mp3",
|
codec="audio-24khz-48kbitrate-mono-mp3",
|
||||||
voice="Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
|
voice="Microsoft Server Speech Text to Speech Voice (en-US, AriaNeural)",
|
||||||
pitch="+0Hz",
|
pitch="+0Hz",
|
||||||
@@ -221,8 +220,7 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
|
|
||||||
Args:
|
Args:
|
||||||
messages (str or list): A list of SSML strings or a single text.
|
messages (str or list): A list of SSML strings or a single text.
|
||||||
sentence_boundary (bool): Whether to use sentence boundary.
|
boundary_type (int): The type of boundary to use. 0 for none, 1 for word_boundary, 2 for sentence_boundary.
|
||||||
word_boundary (bool): Whether to use word boundary.
|
|
||||||
codec (str): The codec to use.
|
codec (str): The codec to use.
|
||||||
voice (str): The voice to use (only applicable to non-customspeak).
|
voice (str): The voice to use (only applicable to non-customspeak).
|
||||||
pitch (str): The pitch to use (only applicable to non-customspeak).
|
pitch (str): The pitch to use (only applicable to non-customspeak).
|
||||||
@@ -234,11 +232,19 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
tuple: The subtitle offset, subtitle, and audio data.
|
tuple: The subtitle offset, subtitle, and audio data.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
word_boundary = False
|
||||||
|
sentence_boundary = False
|
||||||
|
|
||||||
|
if boundary_type > 0:
|
||||||
|
word_boundary = True
|
||||||
|
if boundary_type > 1:
|
||||||
|
sentence_boundary = True
|
||||||
|
|
||||||
sentence_boundary = str(sentence_boundary).lower()
|
sentence_boundary = str(sentence_boundary).lower()
|
||||||
word_boundary = str(word_boundary).lower()
|
word_boundary = str(word_boundary).lower()
|
||||||
|
|
||||||
if not customspeak:
|
if not customspeak:
|
||||||
websocket_max_size = 2 ** 16
|
websocket_max_size = 2**16
|
||||||
overhead_per_message = (
|
overhead_per_message = (
|
||||||
len(
|
len(
|
||||||
ssml_headers_plus_data(
|
ssml_headers_plus_data(
|
||||||
@@ -312,6 +318,9 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
|
|
||||||
# Begin listening for the response.
|
# Begin listening for the response.
|
||||||
download = False
|
download = False
|
||||||
|
current_subtitle = ""
|
||||||
|
first_offset = None
|
||||||
|
last_offset = None
|
||||||
async for received in websocket:
|
async for received in websocket:
|
||||||
if received.type in (
|
if received.type in (
|
||||||
aiohttp.WSMsgType.CLOSED,
|
aiohttp.WSMsgType.CLOSED,
|
||||||
@@ -337,13 +346,53 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
and parameters["Path"] == "audio.metadata"
|
and parameters["Path"] == "audio.metadata"
|
||||||
):
|
):
|
||||||
metadata = json.loads(data)
|
metadata = json.loads(data)
|
||||||
text = metadata["Metadata"][0]["Data"]["text"]["Text"]
|
metadata_type = metadata["Metadata"][0]["Type"]
|
||||||
offset = metadata["Metadata"][0]["Data"]["Offset"]
|
metadata_offset = metadata["Metadata"][0]["Data"][
|
||||||
yield (
|
"Offset"
|
||||||
offset,
|
]
|
||||||
text,
|
try:
|
||||||
None,
|
metadata_duration = metadata["Metadata"][0]["Data"][
|
||||||
)
|
"Duration"
|
||||||
|
]
|
||||||
|
except KeyError:
|
||||||
|
metadata_duration = 0
|
||||||
|
metadata_text = metadata["Metadata"][0]["Data"]["text"][
|
||||||
|
"Text"
|
||||||
|
]
|
||||||
|
if boundary_type == 1:
|
||||||
|
yield (
|
||||||
|
[
|
||||||
|
metadata_offset,
|
||||||
|
metadata_duration,
|
||||||
|
],
|
||||||
|
metadata_text,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
if metadata_type == "WordBoundary":
|
||||||
|
if current_subtitle:
|
||||||
|
current_subtitle += " "
|
||||||
|
current_subtitle += metadata_text
|
||||||
|
if first_offset is None:
|
||||||
|
first_offset = metadata_offset
|
||||||
|
last_offset = [
|
||||||
|
metadata_offset,
|
||||||
|
metadata_duration,
|
||||||
|
]
|
||||||
|
elif metadata_type == "SentenceBoundary":
|
||||||
|
if current_subtitle:
|
||||||
|
yield (
|
||||||
|
[
|
||||||
|
first_offset,
|
||||||
|
sum(last_offset) - first_offset,
|
||||||
|
],
|
||||||
|
current_subtitle,
|
||||||
|
None,
|
||||||
|
)
|
||||||
|
current_subtitle = ""
|
||||||
|
first_offset = None
|
||||||
|
last_offset = None
|
||||||
|
|
||||||
elif received.type == aiohttp.WSMsgType.BINARY:
|
elif received.type == aiohttp.WSMsgType.BINARY:
|
||||||
if download:
|
if download:
|
||||||
yield (
|
yield (
|
||||||
@@ -353,4 +402,10 @@ class Communicate: # pylint: disable=too-few-public-methods
|
|||||||
received.data.split(b"Path:audio\r\n")[1:]
|
received.data.split(b"Path:audio\r\n")[1:]
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
if current_subtitle:
|
||||||
|
yield (
|
||||||
|
[first_offset, sum(last_offset) - first_offset],
|
||||||
|
current_subtitle,
|
||||||
|
None,
|
||||||
|
)
|
||||||
await websocket.close()
|
await websocket.close()
|
||||||
|
|||||||
@@ -28,9 +28,9 @@ def mktimestamp(time_unit):
|
|||||||
Returns:
|
Returns:
|
||||||
str: The timecode of the subtitle.
|
str: The timecode of the subtitle.
|
||||||
"""
|
"""
|
||||||
hour = math.floor(time_unit / 10 ** 7 / 3600)
|
hour = math.floor(time_unit / 10**7 / 3600)
|
||||||
minute = math.floor((time_unit / 10 ** 7 / 60) % 60)
|
minute = math.floor((time_unit / 10**7 / 60) % 60)
|
||||||
seconds = (time_unit / 10 ** 7) % 60
|
seconds = (time_unit / 10**7) % 60
|
||||||
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
|
return f"{hour:02d}:{minute:02d}:{seconds:06.3f}"
|
||||||
|
|
||||||
|
|
||||||
@@ -49,7 +49,7 @@ class SubMaker:
|
|||||||
"""
|
"""
|
||||||
self.subs_and_offset = []
|
self.subs_and_offset = []
|
||||||
self.broken_offset = []
|
self.broken_offset = []
|
||||||
self.overlapping = overlapping * (10 ** 7)
|
self.overlapping = overlapping * (10**7)
|
||||||
|
|
||||||
def create_sub(self, timestamp, text):
|
def create_sub(self, timestamp, text):
|
||||||
"""
|
"""
|
||||||
@@ -57,16 +57,19 @@ class SubMaker:
|
|||||||
and adds it to the list of subtitles
|
and adds it to the list of subtitles
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
timestamp (int): The timestamp of the subtitle.
|
timestamp (list): The offset and duration of the subtitle (a mutable two-item list; it is modified in place).
|
||||||
text (str): The text of the subtitle.
|
text (str): The text of the subtitle.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
None
|
None
|
||||||
"""
|
"""
|
||||||
|
timestamp[1] += timestamp[0]
|
||||||
|
|
||||||
if len(self.subs_and_offset) >= 2:
|
if len(self.subs_and_offset) >= 2:
|
||||||
if self.subs_and_offset[-2] >= timestamp + sum(self.broken_offset):
|
if self.subs_and_offset[-2][-1] >= timestamp[1] + sum(self.broken_offset):
|
||||||
self.broken_offset.append(self.subs_and_offset[-2])
|
self.broken_offset.append(self.subs_and_offset[-2][1])
|
||||||
timestamp = timestamp + sum(self.broken_offset)
|
timestamp[0] += sum(self.broken_offset)
|
||||||
|
timestamp[1] += sum(self.broken_offset)
|
||||||
|
|
||||||
self.subs_and_offset.append(timestamp)
|
self.subs_and_offset.append(timestamp)
|
||||||
self.subs_and_offset.append(text)
|
self.subs_and_offset.append(text)
|
||||||
@@ -80,19 +83,27 @@ class SubMaker:
|
|||||||
"""
|
"""
|
||||||
if len(self.subs_and_offset) >= 2:
|
if len(self.subs_and_offset) >= 2:
|
||||||
data = "WEBVTT\r\n\r\n"
|
data = "WEBVTT\r\n\r\n"
|
||||||
old_time_stamp = None
|
|
||||||
old_sub_data = None
|
|
||||||
for offset, subs in zip(
|
for offset, subs in zip(
|
||||||
self.subs_and_offset[::2], self.subs_and_offset[1::2]
|
self.subs_and_offset[::2], self.subs_and_offset[1::2]
|
||||||
):
|
):
|
||||||
if old_time_stamp is not None and old_sub_data is not None:
|
subs = [subs[i : i + 79] for i in range(0, len(subs), 79)]
|
||||||
data += formatter(
|
|
||||||
old_time_stamp, offset + self.overlapping, old_sub_data
|
for i in range(len(subs) - 1):
|
||||||
)
|
sub = subs[i]
|
||||||
old_time_stamp = offset
|
split_at_word = True
|
||||||
old_sub_data = subs
|
if sub[-1] == " ":
|
||||||
data += formatter(
|
subs[i] = sub[:-1]
|
||||||
old_time_stamp, old_time_stamp + ((10 ** 7) * 10), old_sub_data
|
split_at_word = False
|
||||||
)
|
|
||||||
|
if sub[0] == " ":
|
||||||
|
subs[i] = sub[1:]
|
||||||
|
split_at_word = False
|
||||||
|
|
||||||
|
if split_at_word:
|
||||||
|
subs[i] += "-"
|
||||||
|
|
||||||
|
subs = "\r\n".join(subs)
|
||||||
|
|
||||||
|
data += formatter(offset[0], offset[1] + self.overlapping, subs)
|
||||||
return data
|
return data
|
||||||
return ""
|
return ""
|
||||||
|
|||||||
@@ -32,8 +32,7 @@ async def _tts(args):
|
|||||||
media_file = open(args.write_media, "wb") # pylint: disable=consider-using-with
|
media_file = open(args.write_media, "wb") # pylint: disable=consider-using-with
|
||||||
async for i in tts.run(
|
async for i in tts.run(
|
||||||
args.text,
|
args.text,
|
||||||
args.enable_sentence_boundary,
|
args.boundary_type,
|
||||||
args.enable_word_boundary,
|
|
||||||
args.codec,
|
args.codec,
|
||||||
args.voice,
|
args.voice,
|
||||||
args.pitch,
|
args.pitch,
|
||||||
@@ -108,18 +107,6 @@ async def _main():
|
|||||||
help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx",
|
help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx",
|
||||||
default="+0%",
|
default="+0%",
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
|
||||||
"-s",
|
|
||||||
"--enable-sentence-boundary",
|
|
||||||
help="enable sentence boundary",
|
|
||||||
action="store_true",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"-w",
|
|
||||||
"--enable-word-boundary",
|
|
||||||
help="enable word boundary",
|
|
||||||
action="store_true",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"-O",
|
"-O",
|
||||||
"--overlapping",
|
"--overlapping",
|
||||||
@@ -127,6 +114,13 @@ async def _main():
|
|||||||
default=5,
|
default=5,
|
||||||
type=float,
|
type=float,
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"-b",
|
||||||
|
"--boundary-type",
|
||||||
|
help="set boundary type for subtitles. Default 0 for none. Set 1 for word_boundary, 2 for sentence_boundary",
|
||||||
|
default=0,
|
||||||
|
type=int,
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--write-media", help="instead of stdout, send media output to provided file"
|
"--write-media", help="instead of stdout, send media output to provided file"
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user