diff --git a/.gitignore b/.gitignore
index c4f0b14..9aa6859 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,10 @@
**
-!install.sh
-!easy-playback.sh
+!edge-playback.sh
!edge-tts.py
-!edgeTTS.py
+!example
+!example/**
!LICENSE
-!.gitignore
+!README.md
+!setup.py
+!src
+!src/**
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..90cabb4
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# edge-tts
+
+Use Microsoft Edge's online text-to-speech (TTS) service from the terminal or as a Python module.
diff --git a/easy-playback.sh b/edge-playback.sh
similarity index 100%
rename from easy-playback.sh
rename to edge-playback.sh
diff --git a/edge-tts.py b/edge-tts.py
deleted file mode 100755
index bbf5c94..0000000
--- a/edge-tts.py
+++ /dev/null
@@ -1,153 +0,0 @@
-#!/usr/bin/env python3
-import sys
-import json
-import uuid
-import signal
-import argparse
-import urllib.request
-import asyncio
-import ssl
-import websockets
-import unicodedata
-import logging
-from email.utils import formatdate
-from xml.sax.saxutils import escape
-
-ssl_context = ssl.create_default_context()
-trustedClientToken = '6A5AA1D4EAFF4E9FB37E23D68491D6F4'
-wssUrl = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=' + trustedClientToken
-voiceList = 'https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=' + trustedClientToken
-
-def terminator(signo, stack_frame): sys.exit()
-signal.signal(signal.SIGINT, terminator)
-signal.signal(signal.SIGTERM, terminator)
-def connectId(): return str(uuid.uuid4()).replace("-", "")
-def removeIncompatibleControlChars(s):
- output = []
- for ch in s:
- # We consider that these control characters are whitespace
- if ch in ['\t','\n','\r']:
- pass
- else:
- abr = unicodedata.category(ch)
- if abr.startswith("C"): continue
- output += [ ch ]
- return "".join(output)
-
-def list_voices():
- with urllib.request.urlopen(voiceList) as url:
- logging.debug("Loading json from %s" % voiceList)
- data = json.loads(url.read().decode('utf-8'))
- logging.debug("JSON Loaded")
- return data
-
-def mkssmlmsg(text="", voice="en-US-AriaNeural", pitchString="+0Hz", rateString="+0%", volumeString="+0%", customspeak=False):
- message='X-RequestId:'+connectId()+'\r\nContent-Type:application/ssml+xml\r\n'
- message+='X-Timestamp:'+formatdate()+'Z\r\nPath:ssml\r\n\r\n'
- if customspeak:
- message+=text
- else:
- message+=""
- message+="" + "" + text + ''
- return message
-
-async def run_tts(msg, sentenceBoundaryEnabled="false", wordBoundaryEnabled="false", codec="audio-24khz-48kbitrate-mono-mp3"):
- logging.debug("Doing %s!" % msg)
- async with websockets.connect(wssUrl, ssl=ssl_context) as ws:
- message='X-Timestamp:'+formatdate()+'\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n'
- message+='{"context":{"synthesis":{"audio":{"metadataoptions":{"sentenceBoundaryEnabled":"'+sentenceBoundaryEnabled+'","wordBoundaryEnabled":"'+wordBoundaryEnabled+'"},"outputFormat":"' + codec + '"}}}}\r\n'
- await ws.send(message)
- logging.debug("> %s" % message)
- await ws.send(msg)
- logging.debug("> %s" % msg)
- async for recv in ws:
- recv = recv.encode('utf-8') if type(recv) is not bytes else recv
- logging.debug("< %s" % recv)
- if b'turn.end' in recv:
- await ws.close()
- elif b'Path:audio\r\n' in recv:
- yield b"".join(recv.split(b'Path:audio\r\n')[1:])
-
-# Based on https://github.com/pndurette/gTTS/blob/6d9309f05b3ad26ca356654732f3b5b9c3bec538/gtts/utils.py#L13-L54
-# Modified to measure based on bytes rather than number of characters
-def _minimize(the_string, delim, max_size):
- # Make sure we are measuring based on bytes
- the_string = the_string.encode('utf-8') if type(the_string) is str else the_string
-
- if the_string.startswith(delim):
- the_string = the_string[len(delim):]
-
- if len(the_string) > max_size:
- try:
- # Find the highest index of `delim` in `the_string[0:max_size]`
- # i.e. `the_string` will be cut in half on `delim` index
- idx = the_string.rindex(delim, 0, max_size)
- except ValueError:
- # `delim` not found in `the_string`, index becomes `max_size`
- # i.e. `the_string` will be cut in half arbitrarily on `max_size`
- idx = max_size
- # Call itself again for `the_string[idx:]`
- return [the_string[:idx]] + \
- _minimize(the_string[idx:], delim, max_size)
- else:
- return [the_string]
-
-async def main():
- parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader")
- group = parser.add_mutually_exclusive_group(required=True)
- group.add_argument('-t', '--text', help='what TTS will say')
- group.add_argument('-f', '--file', help='same as --text but read from file')
- parser.add_argument(
- "-L",
- "--log-level",
- default=logging.CRITICAL,
- type=lambda x: getattr(logging, x),
- help="configure the logging level (currently only DEBUG supported)"
- )
- parser.add_argument('-z', '--custom-ssml', help='treat text as ssml to send. For more info check https://bit.ly/3fIq13S', action='store_true')
- parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural')
- parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3')
- group.add_argument('-l', '--list-voices', help="lists available voices. Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true')
- parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz")
- parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
- parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
- parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but settable)", action='store_true')
- parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but settable)", action='store_true')
- args = parser.parse_args()
- logging.basicConfig(level=args.log_level)
- if (args.text or args.file) is not None:
- if args.file is not None:
- # we need to use sys.stdin.read() because some devices
- # like Windows and Termux don't have a /dev/stdin.
- if args.file == "/dev/stdin":
- logging.debug("stdin detected, reading natively from stdin")
- args.text = sys.stdin.read()
- else:
- logging.debug("reading from %s" % args.file)
- with open(args.file, 'r') as file:
- args.text = file.read()
- sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false'
- wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false'
- if args.custom_ssml:
- async for i in run_tts(mkssmlmsg(text=args.text, customspeak=True), sentenceBoundaryEnabled, wordBoundaryEnabled, args.codec):
- sys.stdout.buffer.write(i)
- else:
- overhead = len(mkssmlmsg('', args.voice, args.pitch, args.rate, args.volume).encode('utf-8'))
- wsmax = 65536 - overhead
- for text in _minimize(escape(removeIncompatibleControlChars(args.text)), b" ", wsmax):
- async for i in run_tts(mkssmlmsg(text.decode('utf-8'), args.voice, args.pitch, args.rate, args.volume), sentenceBoundaryEnabled, wordBoundaryEnabled, args.codec):
- sys.stdout.buffer.write(i)
- elif args.list_voices:
- seperator = False
- for voice in list_voices():
- if seperator: print()
- for key in voice.keys():
- logging.debug("Processing key %s" % key)
- if key in ["Name", "SuggestedCodec", "FriendlyName", "Status"]:
- logging.debug("Key %s skipped" % key)
- continue
- print("%s: %s" % ("Name" if key == "ShortName" else key, voice[key]))
- seperator = True
-
-if __name__ == "__main__":
- asyncio.get_event_loop().run_until_complete(main())
diff --git a/edge-tts.py b/edge-tts.py
new file mode 120000
index 0000000..cea074f
--- /dev/null
+++ b/edge-tts.py
@@ -0,0 +1 @@
+src/edgeTTS/__init__.py
\ No newline at end of file
diff --git a/edgeTTS.py b/edgeTTS.py
deleted file mode 120000
index 59373cb..0000000
--- a/edgeTTS.py
+++ /dev/null
@@ -1 +0,0 @@
-edge-tts.py
\ No newline at end of file
diff --git a/example/input_example.py b/example/input_example.py
new file mode 100644
index 0000000..f3e5d74
--- /dev/null
+++ b/example/input_example.py
@@ -0,0 +1,25 @@
+#!/usr/bin/env python3
+
+# Example Python script that shows how to use edge-tts as a module
+
+import asyncio
+import edgeTTS
+import time
+import tempfile
+from xml.sax.saxutils import escape
+from playsound import playsound
+
+async def main():
+    """Ask for a sentence, synthesize it with the default settings and play it."""
+    ask = input("What do you want TTS to say? ")
+    with tempfile.NamedTemporaryFile() as fp:
+        # The text is sent inside an SSML message, so XML special characters must be escaped.
+        # Defaults: Aria voice, audio-24khz-48kbitrate-mono-mp3, etc.
+        async for i in edgeTTS.run_tts(edgeTTS.mkssmlmsg(escape(ask))):
+            fp.write(i)
+        # Push the buffered audio to disk before the player opens the file by name.
+        fp.flush()
+        playsound(fp.name)
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/install.sh b/install.sh
deleted file mode 100755
index a8926b7..0000000
--- a/install.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#!/bin/sh
-{ [ -e "edge-tts.py" ] && [ -e "easy-playback.sh" ]; } || { echo "Script needs to be run on root of the repo" >&2; exit 1; }
-[ -z "$1" ] && { echo "You need to specify the install path." >&2; exit 1; }
-mkdir -p -- "$1" 2>/dev/null
-rm -f -- "$1/edge-tts" "$1/easy-playback"
-cp -f -- edge-tts.py "$1/edge-tts"
-cp -f -- easy-playback.sh "$1/edge-playback"
-chmod +x -- "$1/edge-tts" "$1/edge-playback"
-exit 0
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e908249
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,28 @@
+import setuptools
+
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="edgeTTS-rany",
+    version="0.0.1",
+    author="rany",
+    author_email="ranygh@riseup.net",
+    description="Microsoft Edge's TTS",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="https://github.com/rany2/edge-tts",
+    project_urls={
+        "Bug Tracker": "https://github.com/rany2/edge-tts/issues",
+    },
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Operating System :: OS Independent",
+    ],
+    package_dir={"": "src"},
+    packages=setuptools.find_packages(where="src"),
+    # src/edgeTTS/__init__.py imports websockets at import time.
+    install_requires=["websockets"],
+    python_requires=">=3.6",
+)
diff --git a/src/edgeTTS/__init__.py b/src/edgeTTS/__init__.py
new file mode 100755
index 0000000..bbf5c94
--- /dev/null
+++ b/src/edgeTTS/__init__.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""Client for Microsoft Edge's online text-to-speech service.
+
+Run it as a command line tool (see ``main``) or import it as a module: build
+a request with ``mkssmlmsg`` and stream the audio with ``run_tts``.
+"""
+import sys
+import json
+import uuid
+import signal
+import argparse
+import urllib.request
+import asyncio
+import ssl
+import websockets
+import unicodedata
+import logging
+from email.utils import formatdate
+from xml.sax.saxutils import escape
+
+ssl_context = ssl.create_default_context()
+trustedClientToken = '6A5AA1D4EAFF4E9FB37E23D68491D6F4'
+wssUrl = 'wss://speech.platform.bing.com/consumer/speech/synthesize/readaloud/edge/v1?TrustedClientToken=' + trustedClientToken
+voiceList = 'https://speech.platform.bing.com/consumer/speech/synthesize/readaloud/voices/list?trustedclienttoken=' + trustedClientToken
+
+
+def terminator(signo, stack_frame):
+    """Signal handler that exits the process via ``sys.exit``."""
+    sys.exit()
+
+
+def connectId():
+    """Return a random UUID4 as 32 hex digits without dashes."""
+    return uuid.uuid4().hex
+
+
+def removeIncompatibleControlChars(s):
+    """Return ``s`` without characters whose Unicode category starts with "C".
+
+    Tab, newline and carriage return are kept: they are treated as whitespace.
+    """
+    return "".join(
+        ch for ch in s
+        if ch in '\t\n\r' or not unicodedata.category(ch).startswith("C")
+    )
+
+
+def list_voices():
+    """Fetch ``voiceList`` and return the decoded JSON document."""
+    logging.debug("Loading json from %s", voiceList)
+    with urllib.request.urlopen(voiceList) as url:
+        data = json.loads(url.read().decode('utf-8'))
+    logging.debug("JSON Loaded")
+    return data
+
+
+def mkssmlmsg(text="", voice="en-US-AriaNeural", pitchString="+0Hz", rateString="+0%", volumeString="+0%", customspeak=False):
+    """Build the ``Path:ssml`` message that asks the service to speak ``text``.
+
+    With ``customspeak`` the text is appended verbatim as the caller's own SSML
+    document. Otherwise it is wrapped in speak/voice/prosody elements built from
+    the other arguments; ``text`` must already be XML-escaped in that case.
+    """
+    message = 'X-RequestId:' + connectId() + '\r\nContent-Type:application/ssml+xml\r\n'
+    message += 'X-Timestamp:' + formatdate() + 'Z\r\nPath:ssml\r\n\r\n'
+    if customspeak:
+        return message + text
+    message += "<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xml:lang='en-US'>"
+    message += "<voice name='" + voice + "'>"
+    message += "<prosody pitch='" + pitchString + "' rate='" + rateString + "' volume='" + volumeString + "'>"
+    message += text + '</prosody></voice></speak>'
+    return message
+
+
+async def run_tts(msg, sentenceBoundaryEnabled="false", wordBoundaryEnabled="false", codec="audio-24khz-48kbitrate-mono-mp3"):
+    """Send ``msg`` to the service and yield the audio it returns as ``bytes``.
+
+    ``msg`` is a request built by ``mkssmlmsg``. The boundary flags are the
+    strings "true"/"false" and, like ``codec``, are forwarded in the
+    ``speech.config`` message.
+    """
+    logging.debug("Doing %s!", msg)
+    config = {"context": {"synthesis": {"audio": {
+        "metadataoptions": {
+            "sentenceBoundaryEnabled": sentenceBoundaryEnabled,
+            "wordBoundaryEnabled": wordBoundaryEnabled,
+        },
+        "outputFormat": codec,
+    }}}}
+    audio_marker = b'Path:audio\r\n'
+    async with websockets.connect(wssUrl, ssl=ssl_context) as ws:
+        message = 'X-Timestamp:' + formatdate() + '\r\nContent-Type:application/json; charset=utf-8\r\nPath:speech.config\r\n\r\n'
+        # json.dumps escapes the values so they cannot break out of the document.
+        message += json.dumps(config, separators=(',', ':')) + '\r\n'
+        await ws.send(message)
+        logging.debug("> %s", message)
+        await ws.send(msg)
+        logging.debug("> %s", msg)
+        async for recv in ws:
+            if not isinstance(recv, bytes):
+                recv = recv.encode('utf-8')
+            logging.debug("< %s", recv)
+            if b'turn.end' in recv:
+                break
+            if audio_marker in recv:
+                # Keep everything after the first marker untouched, even if the
+                # audio payload happens to contain the marker bytes again.
+                yield recv.partition(audio_marker)[2]
+
+
+# Based on https://github.com/pndurette/gTTS/blob/6d9309f05b3ad26ca356654732f3b5b9c3bec538/gtts/utils.py#L13-L54
+# Modified to measure based on bytes rather than number of characters
+def _minimize(the_string, delim, max_size):
+    """Split ``the_string`` into a list of ``bytes`` chunks of at most ``max_size`` bytes.
+
+    A ``str`` is encoded as UTF-8 first. Chunks end at the last ``delim`` that
+    fits; without one the cut is made at ``max_size``, moved back so that a
+    multi-byte UTF-8 character is never split. Raises ``ValueError`` when
+    ``max_size`` is not positive.
+    """
+    if max_size <= 0:
+        raise ValueError("max_size must be a positive number of bytes")
+    if isinstance(the_string, str):
+        the_string = the_string.encode('utf-8')
+
+    chunks = []
+    while True:
+        if the_string.startswith(delim):
+            the_string = the_string[len(delim):]
+        if len(the_string) <= max_size:
+            break
+        idx = the_string.rfind(delim, 0, max_size)
+        if idx == 0:
+            # Only a repeated leading delimiter fits; the loop strips it next.
+            continue
+        if idx < 0:
+            idx = max_size
+            # Step back off UTF-8 continuation bytes (0b10xxxxxx).
+            while idx > 0 and (the_string[idx] & 0xC0) == 0x80:
+                idx -= 1
+            if idx == 0:
+                # Not valid UTF-8; cut at max_size to keep making progress.
+                idx = max_size
+        chunks.append(the_string[:idx])
+        the_string = the_string[idx:]
+    # Drop an empty tail, but still return [b''] for empty input.
+    if the_string or not chunks:
+        chunks.append(the_string)
+    return chunks
+
+
+def _log_level(name):
+    """argparse ``type`` callable mapping a name such as "DEBUG" to its ``logging`` constant."""
+    level = getattr(logging, name, None)
+    if isinstance(level, bool) or not isinstance(level, int):
+        raise argparse.ArgumentTypeError("unknown log level: %r" % name)
+    return level
+
+
+def _print_voices():
+    """Print the attributes of every voice from ``list_voices``, blank line between voices."""
+    hidden = ("Name", "SuggestedCodec", "FriendlyName", "Status")
+    for index, voice in enumerate(list_voices()):
+        if index:
+            print()
+        for key, value in voice.items():
+            logging.debug("Processing key %s", key)
+            if key in hidden:
+                logging.debug("Key %s skipped", key)
+                continue
+            print("%s: %s" % ("Name" if key == "ShortName" else key, value))
+
+
+async def main():
+    """Command line entry point: parse ``sys.argv``, then list voices or write audio to stdout."""
+    # Installed here instead of at import time, so that importing this module
+    # from another program does not replace that program's signal handlers.
+    signal.signal(signal.SIGINT, terminator)
+    signal.signal(signal.SIGTERM, terminator)
+
+    parser = argparse.ArgumentParser(description="Microsoft Edge's Online TTS Reader")
+    group = parser.add_mutually_exclusive_group(required=True)
+    group.add_argument('-t', '--text', help='what TTS will say')
+    group.add_argument('-f', '--file', help='same as --text but read from file')
+    parser.add_argument(
+        "-L",
+        "--log-level",
+        default=logging.CRITICAL,
+        type=_log_level,
+        help="configure the logging level (currently only DEBUG supported)"
+    )
+    parser.add_argument('-z', '--custom-ssml', help='treat text as ssml to send. For more info check https://bit.ly/3fIq13S', action='store_true')
+    parser.add_argument('-v', '--voice', help='voice for TTS. Default: en-US-AriaNeural', default='en-US-AriaNeural')
+    parser.add_argument('-c', '--codec', help="codec format. Default: audio-24khz-48kbitrate-mono-mp3. Another choice is webm-24khz-16bit-mono-opus", default='audio-24khz-48kbitrate-mono-mp3')
+    group.add_argument('-l', '--list-voices', help="lists available voices. Edge's list is incomplete so check https://bit.ly/2SFq1d3", action='store_true')
+    parser.add_argument('-p', '--pitch', help="set TTS pitch. Default +0Hz, For more info check https://bit.ly/3eAE5Nx", default="+0Hz")
+    parser.add_argument('-r', '--rate', help="set TTS rate. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
+    parser.add_argument('-V', '--volume', help="set TTS volume. Default +0%%. For more info check https://bit.ly/3eAE5Nx", default="+0%")
+    parser.add_argument('-s', '--enable-sentence-boundary', help="enable sentence boundary (not implemented but settable)", action='store_true')
+    parser.add_argument('-w', '--enable-word-boundary', help="enable word boundary (not implemented but settable)", action='store_true')
+    args = parser.parse_args()
+    logging.basicConfig(level=args.log_level)
+
+    if args.list_voices:
+        _print_voices()
+        return
+
+    if args.file is not None:
+        # we need to use sys.stdin.read() because some devices
+        # like Windows and Termux don't have a /dev/stdin.
+        if args.file == "/dev/stdin":
+            logging.debug("stdin detected, reading natively from stdin")
+            args.text = sys.stdin.read()
+        else:
+            logging.debug("reading from %s", args.file)
+            with open(args.file, 'r') as file:
+                args.text = file.read()
+
+    sentenceBoundaryEnabled = 'true' if args.enable_sentence_boundary else 'false'
+    wordBoundaryEnabled = 'true' if args.enable_word_boundary else 'false'
+    if args.custom_ssml:
+        messages = [mkssmlmsg(text=args.text, customspeak=True)]
+    else:
+        # Keep every message within 65536 bytes, counting the headers and the
+        # SSML wrapper that mkssmlmsg() puts around the text.
+        overhead = len(mkssmlmsg('', args.voice, args.pitch, args.rate, args.volume).encode('utf-8'))
+        wsmax = 65536 - overhead
+        cleaned = escape(removeIncompatibleControlChars(args.text))
+        messages = (
+            mkssmlmsg(chunk.decode('utf-8'), args.voice, args.pitch, args.rate, args.volume)
+            for chunk in _minimize(cleaned, b" ", wsmax)
+        )
+    for message in messages:
+        async for audio in run_tts(message, sentenceBoundaryEnabled, wordBoundaryEnabled, args.codec):
+            sys.stdout.buffer.write(audio)
+
+
+if __name__ == "__main__":
+    # get_event_loop() without a running loop is deprecated, so create one explicitly.
+    loop = asyncio.new_event_loop()
+    try:
+        loop.run_until_complete(main())
+    finally:
+        loop.close()