08-20-0951, chapter 4

This commit is contained in:
xiaolai
2024-08-20 09:51:14 +08:00
parent c575d0ba0f
commit d7808b3785
9 changed files with 162 additions and 87 deletions

View File

@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 49,
"id": "71d35cd9",
"metadata": {},
"outputs": [
@@ -38,81 +38,17 @@
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"ma\n",
"../audios/ma-us-male.mp3 created\n",
"../audios/ma-us-female.mp3 created\n",
"hello\n",
"../audios/hello-us-male.mp3 created\n",
"../audios/hello-us-female.mp3 created\n",
"\n",
"room\n",
"../audios/room-us-male.mp3 created\n",
"../audios/room-us-female.mp3 created\n",
"heat\n",
"../audios/heat-us-male.mp3 created\n",
"../audios/heat-us-female.mp3 created\n",
"\n",
"conversation\n",
"../audios/conversation-us-male.mp3 created\n",
"../audios/conversation-us-female.mp3 created\n",
"\n",
"army\n",
"../audios/army-us-male.mp3 created\n",
"../audios/army-us-female.mp3 created\n",
"\n",
"mob\n",
"../audios/mob-us-male.mp3 created\n",
"../audios/mob-us-female.mp3 created\n",
"\n",
"mom\n",
"../audios/mom-us-male.mp3 created\n",
"../audios/mom-us-female.mp3 created\n",
"\n",
"mind\n",
"../audios/mind-us-male.mp3 created\n",
"../audios/mind-us-female.mp3 created\n",
"\n",
"night\n",
"../audios/night-us-male.mp3 created\n",
"../audios/night-us-female.mp3 created\n",
"\n",
"nine\n",
"../audios/nine-us-male.mp3 created\n",
"../audios/nine-us-female.mp3 created\n",
"\n",
"know\n",
"../audios/know-us-male.mp3 created\n",
"../audios/know-us-female.mp3 created\n",
"\n",
"knight\n",
"../audios/knight-us-male.mp3 created\n",
"../audios/knight-us-female.mp3 created\n",
"\n",
"gnaw\n",
"../audios/gnaw-us-male.mp3 created\n",
"../audios/gnaw-us-female.mp3 created\n",
"\n",
"gnome\n",
"../audios/gnome-us-male.mp3 created\n",
"../audios/gnome-us-female.mp3 created\n",
"\n",
"anchor\n",
"../audios/anchor-us-male.mp3 created\n",
"../audios/anchor-us-female.mp3 created\n",
"\n",
"bank\n",
"../audios/bank-us-male.mp3 created\n",
"../audios/bank-us-female.mp3 created\n",
"\n",
"thank\n",
"../audios/thank-us-male.mp3 exists, skipping...\n",
"../audios/thank-us-female.mp3 exists, skipping...\n",
"\n",
"bang\n",
"../audios/bang-us-male.mp3 created\n",
"../audios/bang-us-female.mp3 created\n",
"\n",
"long\n",
"../audios/long-us-male.mp3 created\n",
"../audios/long-us-female.mp3 created\n",
"\n",
"sing\n",
"../audios/sing-us-male.mp3 created\n",
"../audios/sing-us-female.mp3 created\n",
"high\n",
"../audios/high-us-male.mp3 created\n",
"../audios/high-us-female.mp3 created\n",
"\n"
]
}
@@ -143,9 +79,16 @@
" if verbose:\n",
" print(f'{file_name} created')\n",
" \n",
" time.sleep(1.5)\n",
"\n",
"\n",
" time.sleep(1.5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4146f92e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
"regions = ['us', 'us', 'uk', 'uk']\n",
@@ -158,7 +101,9 @@
" print(voices)\n",
"\n",
"words = \"\"\"\n",
"sam\n",
"hello,\n",
"heat,\n",
"high,\n",
"\"\"\"\n",
"\n",
"for word in words.strip().split(','):\n",
@@ -170,6 +115,121 @@
" await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "2d46cde4",
"metadata": {},
"outputs": [],
"source": [
def get_openai_tts_audio(text, path, performer='alloy'):
    """Synthesize `text` into an mp3 file at `path` with OpenAI's tts-1 model.

    Parameters
    ----------
    text : str
        Text to speak; surrounding whitespace is stripped before the request.
    path : str
        Destination file path the audio stream is written to.
    performer : str
        OpenAI voice name (default 'alloy').
    """
    from openai import OpenAI
    from dotenv import load_dotenv

    # Credentials (OPENAI_API_KEY) come from the environment / .env file.
    load_dotenv()
    client = OpenAI()

    request = client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=performer,
        input=text.strip()
    )
    # Stream the response straight to disk instead of buffering it in memory.
    with request as response:
        response.stream_to_file(path)
" \n",
sentence = "The explanation you gave was clear but I need a more detailed explanation."

# Build dash-separated audio file names: spaces and , . ? ! each map to '-',
# then one pass collapses doubled dashes (e.g. from ", " pairs).
_to_dash = str.maketrans(' ,.?!', '-----')
slug = sentence.translate(_to_dash).strip().replace("--", "-")
audio_filename_openai = slug + '_openai.mp3'
audio_filename_msedge = slug + '_msedge.mp3'
# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')
# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice="en-US-GuyNeural", verbose=True, overwrite=True, play=True)

# Render the same sentence with two OpenAI voices for comparison.
for voice in ["alloy", "nova"]:
    get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-{voice}.mp3', performer=voice)
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "7f219eb1",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"import os\n",
"import IPython\n",
"from datetime import datetime\n",
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"from dotenv import load_dotenv\n",
"from pydub import AudioSegment\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(\n",
")\n",
"\n",
def get_openai_tts_audio(text, filename, performer="alloy"):
    """Turn markdown `text` into one narrated mp3 with cover art and ID3 tags.

    Each non-empty plain-text line is synthesized separately with OpenAI
    tts-1, the pieces are joined with 1.5 s pauses (plus a 1 s lead-in),
    'ending.mp3' is appended, and metadata/artwork are attached via the
    module-level `add_metadata` helper.

    Parameters
    ----------
    text : str
        Markdown source; converted with `markdown_to_text` and split on lines.
    filename : str or falsy
        Output name; '.mp3' is appended when missing. Falsy -> a
        timestamp-plus-voice name is generated.
    performer : str
        OpenAI voice name (default "alloy").

    Returns
    -------
    str or None
        Success message, or None when the required asset files are missing.
    """
    # Verify static assets exist before doing any (paid) API work.
    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):
        print("Either Artwork.png or ending.mp3 file not found.")
        return

    # Split into non-empty plain-text lines.
    lines = [t for t in markdown_to_text(text).split("\n") if t]

    # Synthesize one temp mp3 per line. enumerate() gives every line a unique
    # index; the previous list.index(t) reused the first match for duplicate
    # lines, overwriting one temp file and later crashing on a double delete.
    for i, t in enumerate(lines):
        rspd_audio = client.audio.speech.create(
            model="tts-1",
            voice=performer,
            input=t.strip()
        )
        rspd_audio.stream_to_file(f'temp-{i}.mp3')
        # Single-line progress indicator, updated in place.
        print(f"\rprocessing: {round((i + 1) / len(lines) * 100)}%", end='...')
    print("\n")

    # Stitch the pieces: 1 s of leading silence, 1.5 s pause after each line.
    temp_audio = AudioSegment.silent(duration=1000)
    for i in range(len(lines)):
        seg = AudioSegment.from_file(f'temp-{i}.mp3')
        temp_audio += seg + AudioSegment.silent(duration=1500)
        os.remove(f'temp-{i}.mp3')  # temp piece no longer needed
    temp_audio.export('~temp.mp3', format='mp3')
    speech = AudioSegment.from_file('~temp.mp3')
    ending = AudioSegment.from_file('ending.mp3')
    combined = speech + ending
    os.remove('~temp.mp3')

    # Resolve the output file name.
    if filename:
        if filename.endswith('.mp3'):
            speech_file_path = filename
        else:
            # Append the extension instead of discarding the caller's name.
            speech_file_path = f'{filename}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'
    combined.export(speech_file_path, format='mp3')
    print(f"Audio file saved as {speech_file_path}")

    image_file = 'Artwork.png'
    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'

    add_metadata(speech_file_path, image_file, artist, album, genre)
    # display() is required for the player widget to render from inside a
    # function; the bare Audio(...) expression was created and discarded.
    IPython.display.display(IPython.display.Audio(speech_file_path))

    return f'{speech_file_path} created successfully.'
]
},
{
"cell_type": "markdown",
"id": "2df59a42",