drop custom SSML support

This commit is contained in:
rany2
2022-06-19 21:06:55 +03:00
parent 797d04f182
commit 68a9e191d7
4 changed files with 24 additions and 66 deletions

View File

@@ -61,35 +61,11 @@ You must first check the available voices with the `--list-voices` option:
### Custom SSML ### Custom SSML
It is possible to send Microsoft's text-to-speech servers a custom SSML document, which allows greater customization of the speech. Support for custom SSML was removed in 5.0.0 because Microsoft has taken steps to prevent it from working. You cannot use custom SSML anymore.
Information about the SSML format can be found here on Microsoft's own website: https://docs.microsoft.com/en-us/azure/cognitive-services/speech-service/speech-synthesis-markup
As a short example, suppose you want to apply the following SSML document and play it back using `edge-tts`:
```
<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis"
xmlns:mstts="https://www.w3.org/2001/mstts" xml:lang="en-US">
<voice name="en-US-AriaNeural">
<mstts:express-as style="cheerful">
That'd be just amazing!
</mstts:express-as>
</voice>
</speak>
```
It would be easiest to do the following:
1. Create a file called `custom_ssml.xml` with the above content.
2. Run the following command:
$ edge-tts --custom-ssml --file custom_ssml.xml --write-media amazing.mp3
3. Voila!
### Changing pitch, rate, volume, etc. ### Changing pitch, rate, volume, etc.
It is possible to make minor changes to the generated speech without resorting to custom SSML. However, note that you cannot use the `--custom-ssml` option together with the `--pitch`, `--rate`, `--volume`, etc. options. It is possible to make minor changes to the generated speech.
$ edge-tts --pitch=-10Hz --text "Hello, world!" --write-media hello_with_pitch_down.mp3 $ edge-tts --pitch=-10Hz --text "Hello, world!" --write-media hello_with_pitch_down.mp3
$ edge-tts --rate=0.5 --text "Hello, world!" --write-media hello_with_rate_halved.mp3 $ edge-tts --rate=0.5 --text "Hello, world!" --write-media hello_with_rate_halved.mp3

View File

@@ -1,6 +1,6 @@
[metadata] [metadata]
name = edge-tts name = edge-tts
version = 4.0.11 version = 5.0.0
author = rany author = rany
author_email = ranygh@riseup.net author_email = ranygh@riseup.net
description = Microsoft Edge's TTS description = Microsoft Edge's TTS

View File

@@ -213,7 +213,6 @@ class Communicate:
pitch="+0Hz", pitch="+0Hz",
rate="+0%", rate="+0%",
volume="+0%", volume="+0%",
customspeak=False,
proxy=None, proxy=None,
): ):
""" """
@@ -223,11 +222,10 @@ class Communicate:
messages (str or list): A list of SSML strings or a single text. messages (str or list): A list of SSML strings or a single text.
boundary_type (int): The type of boundary to use. 0 for none, 1 for word_boundary, 2 for sentence_boundary. boundary_type (int): The type of boundary to use. 0 for none, 1 for word_boundary, 2 for sentence_boundary.
codec (str): The codec to use. codec (str): The codec to use.
voice (str): The voice to use (only applicable to non-customspeak). voice (str): The voice to use.
pitch (str): The pitch to use (only applicable to non-customspeak). pitch (str): The pitch to use.
rate (str): The rate to use (only applicable to non-customspeak). rate (str): The rate to use.
volume (str): The volume to use (only applicable to non-customspeak). volume (str): The volume to use.
customspeak (bool): Whether to create the SSML or treat the messages as SSML.
Yields: Yields:
tuple: The subtitle offset, subtitle, and audio data. tuple: The subtitle offset, subtitle, and audio data.
@@ -244,23 +242,19 @@ class Communicate:
word_boundary = str(word_boundary).lower() word_boundary = str(word_boundary).lower()
if not customspeak: websocket_max_size = 2**16
websocket_max_size = 2**16 overhead_per_message = (
overhead_per_message = ( len(
len( ssml_headers_plus_data(
ssml_headers_plus_data( connect_id(), self.date, mkssml("", voice, pitch, rate, volume)
connect_id(), self.date, mkssml("", voice, pitch, rate, volume)
)
) )
+ 50
) # margin of error
messages = split_text_by_byte_length(
escape(remove_incompatible_characters(messages)),
websocket_max_size - overhead_per_message,
) )
else: + 50
if isinstance(messages, str): ) # margin of error
messages = [messages] messages = split_text_by_byte_length(
escape(remove_incompatible_characters(messages)),
websocket_max_size - overhead_per_message,
)
# Variables for the loop # Variables for the loop
download = False download = False
@@ -307,18 +301,13 @@ class Communicate:
# Send the request to the service. # Send the request to the service.
await websocket.send_str(request) await websocket.send_str(request)
# Send the message itself. # Send the message itself.
if not customspeak: await websocket.send_str(
await websocket.send_str( ssml_headers_plus_data(
ssml_headers_plus_data( connect_id(),
connect_id(), self.date,
self.date, mkssml(message, voice, pitch, rate, volume),
mkssml(message, voice, pitch, rate, volume),
)
)
else:
await websocket.send_str(
ssml_headers_plus_data(connect_id(), self.date, message)
) )
)
# Begin listening for the response. # Begin listening for the response.
async for received in websocket: async for received in websocket:

View File

@@ -38,7 +38,6 @@ async def _tts(args):
args.pitch, args.pitch,
args.rate, args.rate,
args.volume, args.volume,
customspeak=args.custom_ssml,
proxy=args.proxy, proxy=args.proxy,
): ):
if i[2] is not None: if i[2] is not None:
@@ -62,12 +61,6 @@ async def _main():
group = parser.add_mutually_exclusive_group(required=True) group = parser.add_mutually_exclusive_group(required=True)
group.add_argument("-t", "--text", help="what TTS will say") group.add_argument("-t", "--text", help="what TTS will say")
group.add_argument("-f", "--file", help="same as --text but read from file") group.add_argument("-f", "--file", help="same as --text but read from file")
parser.add_argument(
"-z",
"--custom-ssml",
help="treat text as ssml to send. For more info check https://bit.ly/3fIq13S",
action="store_true",
)
parser.add_argument( parser.add_argument(
"-v", "-v",
"--voice", "--voice",