diff --git a/edge-tts.py b/edge-tts.py
index c8606f9..a6e8dc1 100755
--- a/edge-tts.py
+++ b/edge-tts.py
@@ -18,135 +18,143 @@ wssUrl = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/ed
voiceList = 'https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=' + trustedClientToken
def debug(msg, fd=sys.stderr):
- if DEBUG: print(msg, file=fd)
+ if DEBUG: print(msg, file=fd)
def terminator(signo, stack_frame): sys.exit()
signal.signal(signal.SIGINT, terminator)
signal.signal(signal.SIGTERM, terminator)
def connectId(): return str(uuid.uuid4()).replace("-", "")
def removeIncompatibleControlChars(s):
- output = []
- for ch in s:
- # We consider that these control characters are whitespace
- if ch in ['\t','\n','\r']:
- pass
- else:
- abr = unicodedata.category(ch)
- if abr.startswith("C"): continue
- output += [ ch ]
- return "".join(output)
-
-# From https://github.com/pndurette/gTTS/blob/master/gtts/utils.py
-def _minimize(the_string, delim, max_size):
- # Remove `delim` from start of `the_string`
- # i.e. prevent a recursive infinite loop on `the_string[0:0]`
- # if `the_string` starts with `delim` and is larger than `max_size`
- if the_string.startswith(delim):
- the_string = the_string[len(delim):]
-
- if len(the_string) > max_size:
- try:
- # Find the highest index of `delim` in `the_string[0:max_size]`
- # i.e. `the_string` will be cut in half on `delim` index
- idx = the_string.rindex(delim, 0, max_size)
- except ValueError:
- # `delim` not found in `the_string`, index becomes `max_size`
- # i.e. `the_string` will be cut in half arbitrarily on `max_size`
- idx = max_size
- # Call itself again for `the_string[idx:]`
- return [the_string[:idx]] + \
- _minimize(the_string[idx:], delim, max_size)
- else:
- return [the_string]
+ output = []
+ for ch in s:
+ # We consider that these control characters are whitespace
+ if ch in ['\t','\n','\r']:
+ pass
+ else:
+ abr = unicodedata.category(ch)
+ if abr.startswith("C"): continue
+ output += [ ch ]
+ return "".join(output)
def list_voices():
- with urllib.request.urlopen(voiceList) as url:
- debug("Loading json from %s" % voiceList)
- data = json.loads(url.read().decode())
- debug("JSON Loaded")
- for voice in data:
- print()
- for key in voice.keys():
- debug("Processing key %s" % key)
- if key in ["Name", "SuggestedCodec", "FriendlyName", "Status"]:
- debug("Key %s skipped" % key)
- continue
- print("%s: %s" % ("Name" if key == "ShortName" else key, voice[key]))
- print()
+ with urllib.request.urlopen(voiceList) as url:
+ debug("Loading json from %s" % voiceList)
+ data = json.loads(url.read().decode())
+ debug("JSON Loaded")
+ for voice in data:
+ print()
+ for key in voice.keys():
+ debug("Processing key %s" % key)
+ if key in ["Name", "SuggestedCodec", "FriendlyName", "Status"]:
+ debug("Key %s skipped" % key)
+ continue
+ print("%s: %s" % ("Name" if key == "ShortName" else key, voice[key]))
+ print()
-async def run_tts():
- async with websockets.connect(wssUrl, ssl=ssl_context) as ws:
- message='X-Timestamp:'+formatdate()+'\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n'
- message+='{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"'+sentenceBoundaryEnabled+'","wordBoundaryEnabled":"'+wordBoundaryEnabled+'"},"outputFormat":"' + codec + '"}}}}\r\n'
- await ws.send(message)
- debug("> %s" % message)
- message='X-RequestId:'+connectId()+'\r\nContent-Type:application/ssml+xml\r\n'
- message+='X-Timestamp:'+formatdate()+'Z\r\nPath:ssml\r\n\r\n'
- message+=""
- message+="" + "" + escape(text) + ''
- await ws.send(message)
- debug("> %s" % message)
- while True:
- recv = await ws.recv()
- recv = recv.encode() if type(recv) is not bytes else recv
- debug("< %s" % recv)
- if b'turn.end' in recv:
- break
- elif b'Path:audio\r\n' in recv:
- sys.stdout.buffer.write(recv.split(b'Path:audio\r\n')[1])
+def mkssmlmsg(text=""):
+ message='X-RequestId:'+connectId()+'\r\nContent-Type:application/ssml+xml\r\n'
+ message+='X-Timestamp:'+formatdate()+'Z\r\nPath:ssml\r\n\r\n'
+ message+=""
+ message+="" + "" + text + ''
+ return message
+
+async def run_tts(msg):
+ debug("Doing %s!" % msg)
+ async with websockets.connect(wssUrl, ssl=ssl_context) as ws:
+ message='X-Timestamp:'+formatdate()+'\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n'
+ message+='{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"'+sentenceBoundaryEnabled+'","wordBoundaryEnabled":"'+wordBoundaryEnabled+'"},"outputFormat":"' + codec + '"}}}}\r\n'
+ await ws.send(message)
+ debug("> %s" % message)
+ await ws.send(msg)
+ debug("> %s" % msg)
+ while True:
+ recv = await ws.recv()
+ recv = recv.encode() if type(recv) is not bytes else recv
+ debug("< %s" % recv)
+ if b'turn.end' in recv:
+ break
+ elif b'Path:audio\r\n' in recv:
+ sys.stdout.buffer.write(recv.split(b'Path:audio\r\n')[1])
+
+# From https://github.com/pndurette/gTTS/blob/6d9309f05b3ad26ca356654732f3b5b9c3bec538/gtts/utils.py#L13-L54
+def _minimize(the_string, delim, max_size):
+ """Recursively split a string in the largest chunks
+ possible from the highest position of a delimiter all the way
+ to a maximum size
+ Args:
+ the_string (string): The string to split.
+ delim (string): The delimiter to split on.
+ max_size (int): The maximum size of a chunk.
+ Returns:
+ list: the minimized string in tokens
+ Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx``
+ is the highest index of ``delim`` found in ``the_string``; and at maximum
+ ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``.
+ In the latter case, the split will occur at ``the_string[max_size]``
+ which can be any character. The function runs itself again on the rest of
+ ``the_string`` (``the_string[idx:]``) until no chunk is larger than
+ ``max_size``.
+ """
+ # Remove `delim` from start of `the_string`
+ # i.e. prevent a recursive infinite loop on `the_string[0:0]`
+ # if `the_string` starts with `delim` and is larger than `max_size`
+ if the_string.startswith(delim):
+ the_string = the_string[len(delim):]
+
+ if len(the_string) > max_size:
+ try:
+ # Find the highest index of `delim` in `the_string[0:max_size]`
+ # i.e. `the_string` will be cut in half on `delim` index
+ idx = the_string.rindex(delim, 0, max_size)
+ except ValueError:
+ # `delim` not found in `the_string`, index becomes `max_size`
+ # i.e. `the_string` will be cut in half arbitrarily on `max_size`
+ idx = max_size
+ # Call itself again for `the_string[idx:]`
+ return [the_string[:idx]] + \
+ _minimize(the_string[idx:], delim, max_size)
+ else:
+ return [the_string]
if __name__ == "__main__":
- parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader")
- group = parser.add_mutually_exclusive_group(required=True)
- group.add_argument('-t', '--text', help='what TTS will say')
- group.add_argument('-f', '--file', help='same as --text but read from file')
- parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural')
- parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3')
- group.add_argument('-l', '--list-voices', help="lists available voices. Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true')
- parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz")
- parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
- parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
- parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but set)", action='store_true')
- parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but set)", action='store_true')
- parser.add_argument('-S', '--dont-split-sentences', help="sends entire text as is (careful because limit is unknown)", action='store_true')
- parser.add_argument('-D', '--debug', help="some debugging", action='store_true')
- args = parser.parse_args()
- DEBUG = args.debug
+ parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader")
+ group = parser.add_mutually_exclusive_group(required=True)
+ group.add_argument('-t', '--text', help='what TTS will say')
+ group.add_argument('-f', '--file', help='same as --text but read from file')
+ parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural')
+ parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3')
+ group.add_argument('-l', '--list-voices', help="lists available voices. Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true')
+ parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz")
+ parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
+ parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
+ parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but set)", action='store_true')
+ parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but set)", action='store_true')
+ parser.add_argument('-D', '--debug', help="some debugging", action='store_true')
+ args = parser.parse_args()
+ DEBUG = args.debug
- if (args.text or args.file) is not None:
- if args.file is not None:
- # we need to use sys.stdin.read() because some devices
- # like Windows and Termux don't have a /dev/stdin.
- if args.file == "/dev/stdin":
- debug("stdin detected, reading natively from stdin")
- args.text = sys.stdin.read()
- else:
- debug("reading from %s" % args.file)
- with open(args.file, 'r') as file:
- args.text = file.read()
- codec = args.codec
- voice = args.voice
- pitchString = args.pitch
- rateString = args.rate
- volumeString = args.volume
- sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false'
- wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false'
- # Websocket max is 65536, lets say that overhead is approx. 5k
- max_size = 65536 - 5000
- if not args.dont_split_sentences:
- try:
- from nltk.tokenize import sent_tokenize
- debug("Was able to load nltk module")
- except Exception as e:
- print("You need nltk for sentence splitting.", file=sys.stderr)
- print("If you can't install it you could use the --dont-split-sentences flag.", file=sys.stderr)
- debug("Exception was %s %s" % (e.message, e.args))
- sys.exit(1)
- for text in _minimize(" ".join(sent_tokenize(removeIncompatibleControlChars(args.text))), " ", max_size):
- debug ("Sent %s to be TTSed!" % text)
- asyncio.get_event_loop().run_until_complete(run_tts())
- else:
- for text in _minimize(removeIncompatibleControlChars(args.text), " ", max_size):
- debug ("Sent %s to be TTSed!" % text)
- asyncio.get_event_loop().run_until_complete(run_tts())
- elif args.list_voices:
- list_voices()
+ if (args.text or args.file) is not None:
+ if args.file is not None:
+ # we need to use sys.stdin.read() because some devices
+ # like Windows and Termux don't have a /dev/stdin.
+ if args.file == "/dev/stdin":
+ debug("stdin detected, reading natively from stdin")
+ args.text = sys.stdin.read()
+ else:
+ debug("reading from %s" % args.file)
+ with open(args.file, 'r') as file:
+ args.text = file.read()
+ codec = args.codec
+ voice = args.voice
+ pitchString = args.pitch
+ rateString = args.rate
+ volumeString = args.volume
+ sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false'
+ wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false'
+ # https://hpbn.co/websocket/ says client must also send a masking key,
+ # which adds an extra 4 bytes to the header, resulting in 6–14 bytes over overhead
+ overhead = len(mkssmlmsg()) + 14
+ wsmax = 65536 - overhead
+ for text in _minimize(escape(removeIncompatibleControlChars(args.text)), " ", wsmax):
+ asyncio.get_event_loop().run_until_complete(run_tts(mkssmlmsg(text)))
+ elif args.list_voices:
+ list_voices()