{
"cells": [
{
"cell_type": "markdown",
"id": "b94f103d-ac43-4d13-83cb-eb5090220881",
"metadata": {},
"source": [
"# EdgeTTS\n",
"\n",
"https://github.com/rany2/edge-tts\n",
"\n",
"edge-tts is a Python module that allows you to use Microsoft Edge's online text-to-speech service from within your Python code, or via the provided `edge-tts` and `edge-playback` commands."
]
},
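{
"cell_type": "markdown",
"id": "cli-usage-note",
"metadata": {},
"source": [
"A quick sketch of the command-line usage mentioned above. The flag names (`--list-voices`, `--text`, `--voice`, `--write-media`) follow the edge-tts README; treat them as assumptions and check `edge-tts --help` for the version you installed. `edge-playback` additionally needs a local media player (mpv) on PATH."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cli-usage-demo",
"metadata": {},
"outputs": [],
"source": [
"# Hedged CLI sketch (requires network access); uncomment to run from the notebook.\n",
"# !edge-tts --list-voices\n",
"# !edge-tts --text \"hello\" --voice en-US-GuyNeural --write-media ../audios/hello-cli.mp3\n",
"# !edge-playback --text \"hello\""
]
},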
{
"cell_type": "code",
"execution_count": null,
"id": "77deb08f-fec3-4327-b2f9-1c893aacaddc",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"# later cells also rely on openai, python-dotenv, pydub and mutagen (pydub needs ffmpeg on PATH)\n",
"%pip install edge-tts pygame openai python-dotenv pydub mutagen"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "71d35cd9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"hello\n",
"../audios/hello-us-male.mp3 created\n",
"../audios/hello-us-female.mp3 created\n",
"\n",
"heat\n",
"../audios/heat-us-male.mp3 created\n",
"../audios/heat-us-female.mp3 created\n",
"\n",
"high\n",
"../audios/high-us-male.mp3 created\n",
"../audios/high-us-female.mp3 created\n",
"\n"
]
}
],
"source": [
"import edge_tts\n",
"import os\n",
"import pygame\n",
"import time\n",
"\n",
"async def generate_edge_tts_audio(text, file_name, voice='en-US-GuyNeural', verbose=False, play=False, overwrite=False):\n",
"    # synthesize `text` with Microsoft Edge TTS and save it to `file_name`\n",
"    communicate = edge_tts.Communicate(text, voice)\n",
"    # skip existing files unless overwrite is requested\n",
"    if os.path.exists(file_name):\n",
"        if overwrite:\n",
"            if verbose:\n",
"                print(f'{file_name} exists, overwriting...')\n",
"        else:\n",
"            if verbose:\n",
"                print(f'{file_name} exists, skipping...')\n",
"            return\n",
"\n",
"    await communicate.save(file_name)\n",
"    if play:\n",
"        pygame.mixer.init()\n",
"        pygame.mixer.music.load(file_name)\n",
"        pygame.mixer.music.play()\n",
"    if verbose:\n",
"        print(f'{file_name} created')\n",
"\n",
"    # crude pause so playback can start and successive requests are spaced out\n",
"    time.sleep(1.5)"
]
},
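{
"cell_type": "markdown",
"id": "playback-wait-note",
"metadata": {},
"source": [
"The fixed `time.sleep(1.5)` above can cut off clips longer than about 1.5 seconds. A small helper like the sketch below (not part of the original notebook) polls `pygame.mixer.music.get_busy()` until playback actually finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "playback-wait-demo",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pygame\n",
"\n",
"def wait_for_playback(poll_interval=0.1):\n",
"    # block until pygame has finished playing the currently loaded track\n",
"    while pygame.mixer.get_init() and pygame.mixer.music.get_busy():\n",
"        time.sleep(poll_interval)\n",
"\n",
"# usage sketch: call wait_for_playback() right after pygame.mixer.music.play()"
]
},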
{
"cell_type": "code",
"execution_count": 84,
"id": "4146f92e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"important\n",
"../audios/important-us-male.mp3 created\n",
"../audios/important-us-female.mp3 created\n"
]
}
],
"source": [
"voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
"regions = ['us', 'us', 'uk', 'uk']\n",
"genders = ['male', 'female', 'male', 'female']\n",
"\n",
"# only_us = False\n",
"only_us = True\n",
"if only_us:\n",
"    voices = voices[:2]\n",
"    print(voices)\n",
"\n",
"# words to synthesize, comma-separated\n",
"words = \"\"\"\n",
"important\n",
"\"\"\"\n",
"\n",
"for word in words.strip().split(','):\n",
"    print(word)\n",
"    for i, voice in enumerate(voices):\n",
"        w = word.strip().lower()\n",
"        if len(w) > 0:\n",
"            filename = f'../audios/{w.replace(\" \", \"-\")}-{regions[i]}-{genders[i]}.mp3'\n",
"            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "2d46cde4",
"metadata": {},
"outputs": [],
"source": [
"def get_openai_tts_audio(text, path, performer='alloy'):\n",
"    # one-shot OpenAI TTS: stream the synthesized speech straight to `path`\n",
"    from openai import OpenAI\n",
"    from dotenv import load_dotenv\n",
"    load_dotenv()\n",
"    client = OpenAI()  # reads OPENAI_API_KEY from the environment (.env)\n",
"\n",
"    with client.audio.speech.with_streaming_response.create(\n",
"        model=\"tts-1\",\n",
"        voice=performer,\n",
"        input=text.strip()\n",
"    ) as response:\n",
"        response.stream_to_file(path)\n",
"\n",
"sentence = \"It's a very important aspect\"\n",
"\n",
"# strip trailing punctuation, then turn the remaining spaces and punctuation into dashes\n",
"# (a small slug() helper along these lines is sketched in the next cell)\n",
"audio_filename_openai = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
"audio_filename_msedge = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
"\n",
"# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')\n",
"# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice=\"en-US-GuyNeural\", verbose=True, overwrite=True, play=True)\n",
"for voice in [\"alloy\", \"nova\"]:\n",
"    get_openai_tts_audio(sentence, f'../audios/{audio_filename_openai}-{voice}.mp3', performer=voice)\n"
]
},
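{
"cell_type": "markdown",
"id": "slug-helper-note",
"metadata": {},
"source": [
"The filename-building expression above is repeated twice; a tiny helper (hypothetical, not part of the original notebook) keeps it in one place."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "slug-helper-demo",
"metadata": {},
"outputs": [],
"source": [
"def slug(sentence):\n",
"    # drop trailing punctuation, map inner spaces/punctuation to dashes, collapse doubled dashes\n",
"    s = sentence.rstrip(',.?!').translate(str.maketrans(' ,.?!', '-----')).strip()\n",
"    while '--' in s:\n",
"        s = s.replace('--', '-')\n",
"    return s\n",
"\n",
"slug(\"It's a very important aspect\")  # -> \"It's-a-very-important-aspect\""
]
},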
{
"cell_type": "code",
"execution_count": 54,
"id": "7f219eb1",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"import os\n",
"import IPython\n",
"from datetime import datetime\n",
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"from dotenv import load_dotenv\n",
"from pydub import AudioSegment\n",
"\n",
"load_dotenv()\n",
"client = OpenAI()  # reads OPENAI_API_KEY from the environment (.env)\n",
"\n",
"# NOTE: markdown_to_text() and add_metadata() are not defined in this notebook;\n",
"# hedged sketches of both helpers follow in the next cell.\n",
"def get_openai_tts_audio(text, filename, performer=\"alloy\"):\n",
"\n",
"    # the cover artwork and the closing jingle must sit next to the notebook\n",
"    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):\n",
"        print(\"Either Artwork.png or ending.mp3 file not found.\")\n",
"        return\n",
"\n",
"    # convert Markdown to plain text, split into lines and drop the empty ones\n",
"    lines = [t for t in markdown_to_text(text).split(\"\\n\") if t]\n",
"\n",
"    # synthesize each line into its own temporary mp3\n",
"    for i, t in enumerate(lines):\n",
"        speech_file_path = f'temp-{i}.mp3'\n",
"        rspd_audio = client.audio.speech.create(\n",
"            model=\"tts-1\",\n",
"            voice=performer,\n",
"            input=t.strip()\n",
"        )\n",
"        rspd_audio.stream_to_file(speech_file_path)\n",
"        # progress percentage, updated in place on one line\n",
"        print(f\"\\rprocessing: {round((i+1)/len(lines)*100)}%\", end='...')\n",
"    print(\"\\n\")\n",
"\n",
"    # stitch the pieces together: 1 s of leading silence, 1.5 s between lines\n",
"    temp_audio = AudioSegment.silent(duration=1000)\n",
"    for i in range(len(lines)):\n",
"        seg = AudioSegment.from_file(f'temp-{i}.mp3')\n",
"        temp_audio += seg + AudioSegment.silent(duration=1500)\n",
"        os.remove(f'temp-{i}.mp3')  # delete the temp file\n",
"    temp_audio.export('~temp.mp3', format='mp3')\n",
"    speech = AudioSegment.from_file('~temp.mp3')\n",
"    ending = AudioSegment.from_file('ending.mp3')\n",
"    combined = speech + ending\n",
"    os.remove('~temp.mp3')\n",
"\n",
"    if filename:\n",
"        # if filename has no extension, add .mp3\n",
"        speech_file_path = filename if filename.endswith('.mp3') else f'{filename}.mp3'\n",
"    else:\n",
"        speech_file_path = f'{datetime.now().strftime(\"%Y%m%d_%H%M%S\")}_{performer}.mp3'\n",
"    combined.export(speech_file_path, format='mp3')\n",
"    print(f\"Audio file saved as {speech_file_path}\")\n",
"\n",
"    image_file = 'Artwork.png'\n",
"    artist = 'tts'\n",
"    album = 'Daily Speech Training'\n",
"    genre = 'SPEECH'\n",
"\n",
"    add_metadata(speech_file_path, image_file, artist, album, genre)\n",
"    IPython.display.display(IPython.display.Audio(speech_file_path))\n",
"\n",
"    return f'{speech_file_path} created successfully.'\n"
]
},
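{
"cell_type": "markdown",
"id": "helper-sketch-note",
"metadata": {},
"source": [
"`markdown_to_text()` and `add_metadata()` are called above but never defined in this notebook. The sketches below are assumptions about what they might look like: a deliberately crude Markdown stripper, and an ID3 tagger built on the `mutagen` classes already imported above (`MP3`, `ID3`, `APIC`, `TPE1`, `TALB`, `TCON`). Replace them with the real helpers if you have them."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "helper-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"\n",
"def markdown_to_text(md):\n",
"    # crude Markdown stripper: drop heading/quote markers and emphasis characters, keep line breaks\n",
"    lines = []\n",
"    for line in md.splitlines():\n",
"        line = line.lstrip('#> ').strip()\n",
"        for ch in ('*', '_', '`'):\n",
"            line = line.replace(ch, '')\n",
"        lines.append(line)\n",
"    return '\\n'.join(lines)\n",
"\n",
"def add_metadata(mp3_path, image_file, artist, album, genre):\n",
"    # attach cover art and basic ID3 tags (artist, album, genre) to the finished mp3\n",
"    audio = MP3(mp3_path, ID3=ID3)\n",
"    if audio.tags is None:\n",
"        audio.add_tags()\n",
"    with open(image_file, 'rb') as img:\n",
"        audio.tags.add(APIC(encoding=3, mime='image/png', type=3, desc='Cover', data=img.read()))\n",
"    audio.tags.add(TPE1(encoding=3, text=artist))\n",
"    audio.tags.add(TALB(encoding=3, text=album))\n",
"    audio.tags.add(TCON(encoding=3, text=genre))\n",
"    audio.save()"
]
},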
{
"cell_type": "markdown",
"id": "2df59a42",
"metadata": {},
"source": [
"# English Voices\n",
"\n",
"* voice = \"en-US-AnaNeural\" (Female)\n",
"* voice = \"en-US-AndrewNeural\" (Male)\n",
"* voice = \"en-US-AriaNeural\" (Female)\n",
"* voice = \"en-US-AvaNeural\" (Female)\n",
"* voice = \"en-US-BrianNeural\" (Male)\n",
"* voice = \"en-US-ChristopherNeural\" (Male)\n",
"* voice = \"en-US-EmmaNeural\" (Female)\n",
"* voice = \"en-US-EricNeural\" (Male)\n",
"* voice = \"en-US-GuyNeural\" (Male)\n",
"* voice = \"en-US-JennyNeural\" (Female)\n",
"* voice = \"en-US-MichelleNeural\" (Female)\n",
"* voice = \"en-US-RogerNeural\" (Male)\n",
"* voice = \"en-US-SteffanNeural\" (Male)\n",
"* voice = \"en-GB-LibbyNeural\" (Female)\n",
"* voice = \"en-GB-MaisieNeural\" (Female)\n",
"* voice = \"en-GB-RyanNeural\" (Male)\n",
"* voice = \"en-GB-SoniaNeural\" (Female)\n",
"* voice = \"en-GB-ThomasNeural\" (Male)\n",
"* voice = \"en-AU-NatashaNeural\" (Female)\n",
"* voice = \"en-AU-WilliamNeural\" (Male)\n",
"* voice = \"en-CA-ClaraNeural\" (Female)\n",
"* voice = \"en-CA-LiamNeural\" (Male)"
]
},
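{
"cell_type": "markdown",
"id": "list-voices-note",
"metadata": {},
"source": [
"Instead of maintaining the list above by hand, edge-tts can enumerate voices at runtime. The sketch below assumes the `edge_tts.list_voices()` coroutine returns dicts with `ShortName`, `Gender` and `Locale` keys (true for recent releases, but verify against the version you installed)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "list-voices-demo",
"metadata": {},
"outputs": [],
"source": [
"import edge_tts\n",
"\n",
"# query the live voice catalogue and keep only the English voices\n",
"all_voices = await edge_tts.list_voices()\n",
"for v in sorted(all_voices, key=lambda v: v['ShortName']):\n",
"    if v['Locale'].startswith('en-'):\n",
"        print(f\"{v['ShortName']:30} {v['Gender']}\")"
]
},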
{
"cell_type": "code",
"execution_count": 79,
"id": "215d423d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_io.BufferedRandom name='../audios/The-art-of-focus-in-our-whirlwind-existence-can-sometimes-feel-like-searching-for-a-needle-in-a-haystack-all-strong.mp3'>"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# build the sentence word by word and stitch the clips together with pydub\n",
"# (each word is synthesized in isolation, hence the '-all-strong' filename);\n",
"# assumes the word-level get_openai_tts_audio(text, path, performer) defined earlier is in scope\n",
"import os\n",
"from pydub import AudioSegment\n",
"\n",
"text = \"\"\"\n",
"The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
"\"\"\"\n",
"\n",
"# start with 1 second of silence\n",
"sentence = AudioSegment.silent(duration=1000)\n",
"\n",
"for word in text.strip().split(' '):\n",
"    w = word.strip().lower()\n",
"    if w == \"a\":\n",
"        w = \"uh\"  # pronounce the article \"a\" as \"uh\" rather than the letter name\n",
"    if len(w) > 0:\n",
"        filename = f'../audios/temp-{w}.mp3'\n",
"        get_openai_tts_audio(w, filename, performer=\"alloy\")\n",
"        sentence += AudioSegment.from_file(filename) + AudioSegment.silent(duration=200)\n",
"        os.remove(filename)  # remove the temp file\n",
"sentence += AudioSegment.silent(duration=1000)\n",
"# save the sentence as a single audio file\n",
"sentence.export(f'../audios/{text.strip().replace(\" \",\"-\")}-all-strong.mp3', format='mp3')\n"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "5a718cf9",
"metadata": {},
"outputs": [],
"source": [
"# same sentence again, synthesized naturally in a single pass for comparison\n",
"text = \"\"\"\n",
"The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
"\"\"\"\n",
"filename = f'../audios/{text.strip().replace(\" \",\"-\")}-natural.mp3'\n",
"get_openai_tts_audio(text, filename, performer=\"alloy\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}