From 21d81aa79ce1fbcca860d8e41815f2a338e3d359 Mon Sep 17 00:00:00 2001 From: rany Date: Sun, 23 May 2021 18:10:10 +0300 Subject: [PATCH] get rid of nltk dependency --- edge-tts.py | 250 +++++++++++++++++++++++++++------------------------- 1 file changed, 129 insertions(+), 121 deletions(-) diff --git a/edge-tts.py b/edge-tts.py index c8606f9..a6e8dc1 100755 --- a/edge-tts.py +++ b/edge-tts.py @@ -18,135 +18,143 @@ wssUrl = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/ed voiceList = 'https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=' + trustedClientToken def debug(msg, fd=sys.stderr): - if DEBUG: print(msg, file=fd) + if DEBUG: print(msg, file=fd) def terminator(signo, stack_frame): sys.exit() signal.signal(signal.SIGINT, terminator) signal.signal(signal.SIGTERM, terminator) def connectId(): return str(uuid.uuid4()).replace("-", "") def removeIncompatibleControlChars(s): - output = [] - for ch in s: - # We consider that these control characters are whitespace - if ch in ['\t','\n','\r']: - pass - else: - abr = unicodedata.category(ch) - if abr.startswith("C"): continue - output += [ ch ] - return "".join(output) - -# From https://github.com/pndurette/gTTS/blob/master/gtts/utils.py -def _minimize(the_string, delim, max_size): - # Remove `delim` from start of `the_string` - # i.e. prevent a recursive infinite loop on `the_string[0:0]` - # if `the_string` starts with `delim` and is larger than `max_size` - if the_string.startswith(delim): - the_string = the_string[len(delim):] - - if len(the_string) > max_size: - try: - # Find the highest index of `delim` in `the_string[0:max_size]` - # i.e. `the_string` will be cut in half on `delim` index - idx = the_string.rindex(delim, 0, max_size) - except ValueError: - # `delim` not found in `the_string`, index becomes `max_size` - # i.e. 
`the_string` will be cut in half arbitrarily on `max_size` - idx = max_size - # Call itself again for `the_string[idx:]` - return [the_string[:idx]] + \ - _minimize(the_string[idx:], delim, max_size) - else: - return [the_string] + output = [] + for ch in s: + # We consider that these control characters are whitespace + if ch in ['\t','\n','\r']: + pass + else: + abr = unicodedata.category(ch) + if abr.startswith("C"): continue + output += [ ch ] + return "".join(output) def list_voices(): - with urllib.request.urlopen(voiceList) as url: - debug("Loading json from %s" % voiceList) - data = json.loads(url.read().decode()) - debug("JSON Loaded") - for voice in data: - print() - for key in voice.keys(): - debug("Processing key %s" % key) - if key in ["Name", "SuggestedCodec", "FriendlyName", "Status"]: - debug("Key %s skipped" % key) - continue - print("%s: %s" % ("Name" if key == "ShortName" else key, voice[key])) - print() + with urllib.request.urlopen(voiceList) as url: + debug("Loading json from %s" % voiceList) + data = json.loads(url.read().decode()) + debug("JSON Loaded") + for voice in data: + print() + for key in voice.keys(): + debug("Processing key %s" % key) + if key in ["Name", "SuggestedCodec", "FriendlyName", "Status"]: + debug("Key %s skipped" % key) + continue + print("%s: %s" % ("Name" if key == "ShortName" else key, voice[key])) + print() -async def run_tts(): - async with websockets.connect(wssUrl, ssl=ssl_context) as ws: - message='X-Timestamp:'+formatdate()+'\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n' - message+='{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"'+sentenceBoundaryEnabled+'","wordBoundaryEnabled":"'+wordBoundaryEnabled+'"},"outputFormat":"' + codec + '"}}}}\r\n' - await ws.send(message) - debug("> %s" % message) - message='X-RequestId:'+connectId()+'\r\nContent-Type:application/ssml+xml\r\n' - message+='X-Timestamp:'+formatdate()+'Z\r\nPath:ssml\r\n\r\n' - 
message+="" - message+="" + "" + escape(text) + '' - await ws.send(message) - debug("> %s" % message) - while True: - recv = await ws.recv() - recv = recv.encode() if type(recv) is not bytes else recv - debug("< %s" % recv) - if b'turn.end' in recv: - break - elif b'Path:audio\r\n' in recv: - sys.stdout.buffer.write(recv.split(b'Path:audio\r\n')[1]) +def mkssmlmsg(text=""): + message='X-RequestId:'+connectId()+'\r\nContent-Type:application/ssml+xml\r\n' + message+='X-Timestamp:'+formatdate()+'Z\r\nPath:ssml\r\n\r\n' + message+="" + message+="" + "" + text + '' + return message + +async def run_tts(msg): + debug("Doing %s!" % msg) + async with websockets.connect(wssUrl, ssl=ssl_context) as ws: + message='X-Timestamp:'+formatdate()+'\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n' + message+='{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"'+sentenceBoundaryEnabled+'","wordBoundaryEnabled":"'+wordBoundaryEnabled+'"},"outputFormat":"' + codec + '"}}}}\r\n' + await ws.send(message) + debug("> %s" % message) + await ws.send(msg) + debug("> %s" % msg) + while True: + recv = await ws.recv() + recv = recv.encode() if type(recv) is not bytes else recv + debug("< %s" % recv) + if b'turn.end' in recv: + break + elif b'Path:audio\r\n' in recv: + sys.stdout.buffer.write(recv.split(b'Path:audio\r\n')[1]) + +# From https://github.com/pndurette/gTTS/blob/6d9309f05b3ad26ca356654732f3b5b9c3bec538/gtts/utils.py#L13-L54 +def _minimize(the_string, delim, max_size): + """Recursively split a string in the largest chunks + possible from the highest position of a delimiter all the way + to a maximum size + Args: + the_string (string): The string to split. + delim (string): The delimiter to split on. + max_size (int): The maximum size of a chunk. 
+ Returns: + list: the minimized string in tokens + Every chunk size will be at minimum ``the_string[0:idx]`` where ``idx`` + is the highest index of ``delim`` found in ``the_string``; and at maximum + ``the_string[0:max_size]`` if no ``delim`` was found in ``the_string``. + In the latter case, the split will occur at ``the_string[max_size]`` + which can be any character. The function runs itself again on the rest of + ``the_string`` (``the_string[idx:]``) until no chunk is larger than + ``max_size``. + """ + # Remove `delim` from start of `the_string` + # i.e. prevent a recursive infinite loop on `the_string[0:0]` + # if `the_string` starts with `delim` and is larger than `max_size` + if the_string.startswith(delim): + the_string = the_string[len(delim):] + + if len(the_string) > max_size: + try: + # Find the highest index of `delim` in `the_string[0:max_size]` + # i.e. `the_string` will be cut in half on `delim` index + idx = the_string.rindex(delim, 0, max_size) + except ValueError: + # `delim` not found in `the_string`, index becomes `max_size` + # i.e. `the_string` will be cut in half arbitrarily on `max_size` + idx = max_size + # Call itself again for `the_string[idx:]` + return [the_string[:idx]] + \ + _minimize(the_string[idx:], delim, max_size) + else: + return [the_string] if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader") - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument('-t', '--text', help='what TTS will say') - group.add_argument('-f', '--file', help='same as --text but read from file') - parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural') - parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3') - group.add_argument('-l', '--list-voices', help="lists available voices. 
Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true') - parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz") - parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%") - parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%") - parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but set)", action='store_true') - parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but set)", action='store_true') - parser.add_argument('-S', '--dont-split-sentences', help="sends entire text as is (careful because limit is unknown)", action='store_true') - parser.add_argument('-D', '--debug', help="some debugging", action='store_true') - args = parser.parse_args() - DEBUG = args.debug + parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader") + group = parser.add_mutually_exclusive_group(required=True) + group.add_argument('-t', '--text', help='what TTS will say') + group.add_argument('-f', '--file', help='same as --text but read from file') + parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural') + parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3') + group.add_argument('-l', '--list-voices', help="lists available voices. Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true') + parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz") + parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. 
For more info check https://bit.ly/3eAE5Nx", default="+0%") + parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%") + parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but set)", action='store_true') + parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but set)", action='store_true') + parser.add_argument('-D', '--debug', help="some debugging", action='store_true') + args = parser.parse_args() + DEBUG = args.debug - if (args.text or args.file) is not None: - if args.file is not None: - # we need to use sys.stdin.read() because some devices - # like Windows and Termux don't have a /dev/stdin. - if args.file == "/dev/stdin": - debug("stdin detected, reading natively from stdin") - args.text = sys.stdin.read() - else: - debug("reading from %s" % args.file) - with open(args.file, 'r') as file: - args.text = file.read() - codec = args.codec - voice = args.voice - pitchString = args.pitch - rateString = args.rate - volumeString = args.volume - sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false' - wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false' - # Websocket max is 65536, lets say that overhead is approx. 5k - max_size = 65536 - 5000 - if not args.dont_split_sentences: - try: - from nltk.tokenize import sent_tokenize - debug("Was able to load nltk module") - except Exception as e: - print("You need nltk for sentence splitting.", file=sys.stderr) - print("If you can't install it you could use the --dont-split-sentences flag.", file=sys.stderr) - debug("Exception was %s %s" % (e.message, e.args)) - sys.exit(1) - for text in _minimize(" ".join(sent_tokenize(removeIncompatibleControlChars(args.text))), " ", max_size): - debug ("Sent %s to be TTSed!" 
% text) - asyncio.get_event_loop().run_until_complete(run_tts()) - else: - for text in _minimize(removeIncompatibleControlChars(args.text), " ", max_size): - debug ("Sent %s to be TTSed!" % text) - asyncio.get_event_loop().run_until_complete(run_tts()) - elif args.list_voices: - list_voices() + if (args.text or args.file) is not None: + if args.file is not None: + # we need to use sys.stdin.read() because some devices + # like Windows and Termux don't have a /dev/stdin. + if args.file == "/dev/stdin": + debug("stdin detected, reading natively from stdin") + args.text = sys.stdin.read() + else: + debug("reading from %s" % args.file) + with open(args.file, 'r') as file: + args.text = file.read() + codec = args.codec + voice = args.voice + pitchString = args.pitch + rateString = args.rate + volumeString = args.volume + sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false' + wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false' + # https://hpbn.co/websocket/ says client must also send a masking key, + # which adds an extra 4 bytes to the header, resulting in 6–14 bytes of overhead + overhead = len(mkssmlmsg()) + 14 + wsmax = 65536 - overhead + for text in _minimize(escape(removeIncompatibleControlChars(args.text)), " ", wsmax): + asyncio.get_event_loop().run_until_complete(run_tts(mkssmlmsg(text))) + elif args.list_voices: + list_voices()