{
"cells": [
{
"cell_type": "markdown",
"id": "b94f103d-ac43-4d13-83cb-eb5090220881",
"metadata": {},
"source": [
"# EdgeTTS\n",
"\n",
"https://github.com/rany2/edge-tts\n",
"\n",
"edge-tts is a Python module that allows you to use Microsoft Edge's online text-to-speech service from within your Python code, or via the provided `edge-tts` and `edge-playback` commands."
]
},
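{
"cell_type": "markdown",
"id": "cli-usage-note",
"metadata": {},
"source": [
"A quick sketch of the command-line usage mentioned above. The flag names (`--list-voices`, `--text`, `--voice`, `--write-media`) follow the edge-tts README; treat them as assumptions and check `edge-tts --help` for the version you installed. `edge-playback` additionally needs a local media player (mpv) on PATH."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "cli-usage-demo",
"metadata": {},
"outputs": [],
"source": [
"# Hedged CLI sketch (requires network access); uncomment to run from the notebook.\n",
"# !edge-tts --list-voices\n",
"# !edge-tts --text \"hello\" --voice en-US-GuyNeural --write-media ../audios/hello-cli.mp3\n",
"# !edge-playback --text \"hello\""
]
},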
{
"cell_type": "code",
"execution_count": null,
"id": "77deb08f-fec3-4327-b2f9-1c893aacaddc",
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [],
"source": [
"# later cells also rely on openai, python-dotenv, pydub and mutagen (pydub needs ffmpeg on PATH)\n",
"%pip install edge-tts pygame openai python-dotenv pydub mutagen"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "71d35cd9",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"hello\n",
"../audios/hello-us-male.mp3 created\n",
"../audios/hello-us-female.mp3 created\n",
"\n",
"heat\n",
"../audios/heat-us-male.mp3 created\n",
"../audios/heat-us-female.mp3 created\n",
"\n",
"high\n",
"../audios/high-us-male.mp3 created\n",
"../audios/high-us-female.mp3 created\n",
"\n"
]
}
],
"source": [
"import edge_tts\n",
"import os\n",
"import pygame\n",
"import time\n",
"\n",
"async def generate_edge_tts_audio(text, file_name, voice='en-US-GuyNeural', verbose=False, play=False, overwrite=False):\n",
"    # synthesize `text` with Microsoft Edge TTS and save it to `file_name`\n",
"    communicate = edge_tts.Communicate(text, voice)\n",
"    # skip existing files unless overwrite is requested\n",
"    if os.path.exists(file_name):\n",
"        if overwrite:\n",
"            if verbose:\n",
"                print(f'{file_name} exists, overwriting...')\n",
"        else:\n",
"            if verbose:\n",
"                print(f'{file_name} exists, skipping...')\n",
"            return\n",
"\n",
"    await communicate.save(file_name)\n",
"    if play:\n",
"        pygame.mixer.init()\n",
"        pygame.mixer.music.load(file_name)\n",
"        pygame.mixer.music.play()\n",
"    if verbose:\n",
"        print(f'{file_name} created')\n",
"\n",
"    # crude pause so playback can start and successive requests are spaced out\n",
"    time.sleep(1.5)"
]
},
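{
"cell_type": "markdown",
"id": "playback-wait-note",
"metadata": {},
"source": [
"The fixed `time.sleep(1.5)` above can cut off clips longer than about 1.5 seconds. A small helper like the sketch below (not part of the original notebook) polls `pygame.mixer.music.get_busy()` until playback actually finishes."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "playback-wait-demo",
"metadata": {},
"outputs": [],
"source": [
"import time\n",
"import pygame\n",
"\n",
"def wait_for_playback(poll_interval=0.1):\n",
"    # block until pygame has finished playing the currently loaded track\n",
"    while pygame.mixer.get_init() and pygame.mixer.music.get_busy():\n",
"        time.sleep(poll_interval)\n",
"\n",
"# usage sketch: call wait_for_playback() right after pygame.mixer.music.play()"
]
},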
{
"cell_type": "code",
"execution_count": 84,
"id": "4146f92e",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"important\n",
"../audios/important-us-male.mp3 created\n",
"../audios/important-us-female.mp3 created\n"
]
}
],
"source": [
"voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
"regions = ['us', 'us', 'uk', 'uk']\n",
"genders = ['male', 'female', 'male', 'female']\n",
"\n",
"# only_us = False\n",
"only_us = True\n",
"if only_us:\n",
"    voices = voices[:2]\n",
"    print(voices)\n",
"\n",
"# words to synthesize, comma-separated\n",
"words = \"\"\"\n",
"important\n",
"\"\"\"\n",
"\n",
"for word in words.strip().split(','):\n",
"    print(word)\n",
"    for i, voice in enumerate(voices):\n",
"        w = word.strip().lower()\n",
"        if len(w) > 0:\n",
"            filename = f'../audios/{w.replace(\" \", \"-\")}-{regions[i]}-{genders[i]}.mp3'\n",
"            await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 69,
"id": "2d46cde4",
"metadata": {},
"outputs": [],
"source": [
"def get_openai_tts_audio(text, path, performer='alloy'):\n",
"    # one-shot OpenAI TTS: stream the synthesized speech straight to `path`\n",
"    from openai import OpenAI\n",
"    from dotenv import load_dotenv\n",
"    load_dotenv()\n",
"    client = OpenAI()  # reads OPENAI_API_KEY from the environment (.env)\n",
"\n",
"    with client.audio.speech.with_streaming_response.create(\n",
"        model=\"tts-1\",\n",
"        voice=performer,\n",
"        input=text.strip()\n",
"    ) as response:\n",
"        response.stream_to_file(path)\n",
"\n",
"sentence = \"It's a very important aspect\"\n",
"\n",
"# strip trailing punctuation, then turn the remaining spaces and punctuation into dashes\n",
"# (a small slug() helper along these lines is sketched in the next cell)\n",
"audio_filename_openai = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
"audio_filename_msedge = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
"\n",
"# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')\n",
"# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice=\"en-US-GuyNeural\", verbose=True, overwrite=True, play=True)\n",
"for voice in [\"alloy\", \"nova\"]:\n",
"    get_openai_tts_audio(sentence, f'../audios/{audio_filename_openai}-{voice}.mp3', performer=voice)\n"
]
},
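{
"cell_type": "markdown",
"id": "slug-helper-note",
"metadata": {},
"source": [
"The filename-building expression above is repeated twice; a tiny helper (hypothetical, not part of the original notebook) keeps it in one place."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "slug-helper-demo",
"metadata": {},
"outputs": [],
"source": [
"def slug(sentence):\n",
"    # drop trailing punctuation, map inner spaces/punctuation to dashes, collapse doubled dashes\n",
"    s = sentence.rstrip(',.?!').translate(str.maketrans(' ,.?!', '-----')).strip()\n",
"    while '--' in s:\n",
"        s = s.replace('--', '-')\n",
"    return s\n",
"\n",
"slug(\"It's a very important aspect\")  # -> \"It's-a-very-important-aspect\""
]
},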
{
"cell_type": "code",
"execution_count": 54,
"id": "7f219eb1",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"import os\n",
"import IPython\n",
"from datetime import datetime\n",
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"from dotenv import load_dotenv\n",
"from pydub import AudioSegment\n",
"\n",
"load_dotenv()\n",
"client = OpenAI()  # reads OPENAI_API_KEY from the environment (.env)\n",
"\n",
"# NOTE: markdown_to_text() and add_metadata() are not defined in this notebook;\n",
"# hedged sketches of both helpers follow in the next cell.\n",
"def get_openai_tts_audio(text, filename, performer=\"alloy\"):\n",
"\n",
"    # the cover artwork and the closing jingle must sit next to the notebook\n",
"    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):\n",
"        print(\"Either Artwork.png or ending.mp3 file not found.\")\n",
"        return\n",
"\n",
"    # convert Markdown to plain text, split into lines and drop the empty ones\n",
"    lines = [t for t in markdown_to_text(text).split(\"\\n\") if t]\n",
"\n",
"    # synthesize each line into its own temporary mp3\n",
"    for i, t in enumerate(lines):\n",
"        speech_file_path = f'temp-{i}.mp3'\n",
"        rspd_audio = client.audio.speech.create(\n",
"            model=\"tts-1\",\n",
"            voice=performer,\n",
"            input=t.strip()\n",
"        )\n",
"        rspd_audio.stream_to_file(speech_file_path)\n",
"        # progress percentage, updated in place on one line\n",
"        print(f\"\\rprocessing: {round((i+1)/len(lines)*100)}%\", end='...')\n",
"    print(\"\\n\")\n",
"\n",
"    # stitch the pieces together: 1 s of leading silence, 1.5 s between lines\n",
"    temp_audio = AudioSegment.silent(duration=1000)\n",
"    for i in range(len(lines)):\n",
"        seg = AudioSegment.from_file(f'temp-{i}.mp3')\n",
"        temp_audio += seg + AudioSegment.silent(duration=1500)\n",
"        os.remove(f'temp-{i}.mp3')  # delete the temp file\n",
"    temp_audio.export('~temp.mp3', format='mp3')\n",
"    speech = AudioSegment.from_file('~temp.mp3')\n",
"    ending = AudioSegment.from_file('ending.mp3')\n",
"    combined = speech + ending\n",
"    os.remove('~temp.mp3')\n",
"\n",
"    if filename:\n",
"        # if filename has no extension, add .mp3\n",
"        speech_file_path = filename if filename.endswith('.mp3') else f'{filename}.mp3'\n",
"    else:\n",
"        speech_file_path = f'{datetime.now().strftime(\"%Y%m%d_%H%M%S\")}_{performer}.mp3'\n",
"    combined.export(speech_file_path, format='mp3')\n",
"    print(f\"Audio file saved as {speech_file_path}\")\n",
"\n",
"    image_file = 'Artwork.png'\n",
"    artist = 'tts'\n",
"    album = 'Daily Speech Training'\n",
"    genre = 'SPEECH'\n",
"\n",
"    add_metadata(speech_file_path, image_file, artist, album, genre)\n",
"    IPython.display.display(IPython.display.Audio(speech_file_path))\n",
"\n",
"    return f'{speech_file_path} created successfully.'\n"
]
},
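{
"cell_type": "markdown",
"id": "helper-sketch-note",
"metadata": {},
"source": [
"`markdown_to_text()` and `add_metadata()` are called above but never defined in this notebook. The sketches below are assumptions about what they might look like: a deliberately crude Markdown stripper, and an ID3 tagger built on the `mutagen` classes already imported above (`MP3`, `ID3`, `APIC`, `TPE1`, `TALB`, `TCON`). Replace them with the real helpers if you have them."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "helper-sketch-code",
"metadata": {},
"outputs": [],
"source": [
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"\n",
"def markdown_to_text(md):\n",
"    # crude Markdown stripper: drop heading/quote markers and emphasis characters, keep line breaks\n",
"    lines = []\n",
"    for line in md.splitlines():\n",
"        line = line.lstrip('#> ').strip()\n",
"        for ch in ('*', '_', '`'):\n",
"            line = line.replace(ch, '')\n",
"        lines.append(line)\n",
"    return '\\n'.join(lines)\n",
"\n",
"def add_metadata(mp3_path, image_file, artist, album, genre):\n",
"    # attach cover art and basic ID3 tags (artist, album, genre) to the finished mp3\n",
"    audio = MP3(mp3_path, ID3=ID3)\n",
"    if audio.tags is None:\n",
"        audio.add_tags()\n",
"    with open(image_file, 'rb') as img:\n",
"        audio.tags.add(APIC(encoding=3, mime='image/png', type=3, desc='Cover', data=img.read()))\n",
"    audio.tags.add(TPE1(encoding=3, text=artist))\n",
"    audio.tags.add(TALB(encoding=3, text=album))\n",
"    audio.tags.add(TCON(encoding=3, text=genre))\n",
"    audio.save()"
]
},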
{
"cell_type": "markdown",
"id": "2df59a42",
"metadata": {},
"source": [
"# English Voices\n",
"\n",
"* voice = \"en-US-AnaNeural\" (Female)\n",
"* voice = \"en-US-AndrewNeural\" (Male)\n",
"* voice = \"en-US-AriaNeural\" (Female)\n",
"* voice = \"en-US-AvaNeural\" (Female)\n",
"* voice = \"en-US-BrianNeural\" (Male)\n",
"* voice = \"en-US-ChristopherNeural\" (Male)\n",
"* voice = \"en-US-EmmaNeural\" (Female)\n",
"* voice = \"en-US-EricNeural\" (Male)\n",
"* voice = \"en-US-GuyNeural\" (Male)\n",
"* voice = \"en-US-JennyNeural\" (Female)\n",
"* voice = \"en-US-MichelleNeural\" (Female)\n",
"* voice = \"en-US-RogerNeural\" (Male)\n",
"* voice = \"en-US-SteffanNeural\" (Male)\n",
"* voice = \"en-GB-LibbyNeural\" (Female)\n",
"* voice = \"en-GB-MaisieNeural\" (Female)\n",
"* voice = \"en-GB-RyanNeural\" (Male)\n",
"* voice = \"en-GB-SoniaNeural\" (Female)\n",
"* voice = \"en-GB-ThomasNeural\" (Male)\n",
"* voice = \"en-AU-NatashaNeural\" (Female)\n",
"* voice = \"en-AU-WilliamNeural\" (Male)\n",
"* voice = \"en-CA-ClaraNeural\" (Female)\n",
"* voice = \"en-CA-LiamNeural\" (Male)"
]
},
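{
"cell_type": "markdown",
"id": "list-voices-note",
"metadata": {},
"source": [
"Instead of maintaining the list above by hand, edge-tts can enumerate voices at runtime. The sketch below assumes the `edge_tts.list_voices()` coroutine returns dicts with `ShortName`, `Gender` and `Locale` keys (true for recent releases, but verify against the version you installed)."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "list-voices-demo",
"metadata": {},
"outputs": [],
"source": [
"import edge_tts\n",
"\n",
"# query the live voice catalogue and keep only the English voices\n",
"all_voices = await edge_tts.list_voices()\n",
"for v in sorted(all_voices, key=lambda v: v['ShortName']):\n",
"    if v['Locale'].startswith('en-'):\n",
"        print(f\"{v['ShortName']:30} {v['Gender']}\")"
]
},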
{
"cell_type": "code",
"execution_count": 79,
"id": "215d423d",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<_io.BufferedRandom name='../audios/The-art-of-focus-in-our-whirlwind-existence-can-sometimes-feel-like-searching-for-a-needle-in-a-haystack-all-strong.mp3'>"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# build the sentence word by word and stitch the clips together with pydub\n",
"# (each word is synthesized in isolation, hence the '-all-strong' filename);\n",
"# assumes the word-level get_openai_tts_audio(text, path, performer) defined earlier is in scope\n",
"import os\n",
"from pydub import AudioSegment\n",
"\n",
"text = \"\"\"\n",
"The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
"\"\"\"\n",
"\n",
"# start with 1 second of silence\n",
"sentence = AudioSegment.silent(duration=1000)\n",
"\n",
"for word in text.strip().split(' '):\n",
"    w = word.strip().lower()\n",
"    if w == \"a\":\n",
"        w = \"uh\"  # pronounce the article \"a\" as \"uh\" rather than the letter name\n",
"    if len(w) > 0:\n",
"        filename = f'../audios/temp-{w}.mp3'\n",
"        get_openai_tts_audio(w, filename, performer=\"alloy\")\n",
"        sentence += AudioSegment.from_file(filename) + AudioSegment.silent(duration=200)\n",
"        os.remove(filename)  # remove the temp file\n",
"sentence += AudioSegment.silent(duration=1000)\n",
"# save the sentence as a single audio file\n",
"sentence.export(f'../audios/{text.strip().replace(\" \",\"-\")}-all-strong.mp3', format='mp3')\n"
]
},
{
"cell_type": "code",
"execution_count": 77,
"id": "5a718cf9",
"metadata": {},
"outputs": [],
"source": [
"# same sentence again, synthesized naturally in a single pass for comparison\n",
"text = \"\"\"\n",
"The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
"\"\"\"\n",
"filename = f'../audios/{text.strip().replace(\" \",\"-\")}-natural.mp3'\n",
"get_openai_tts_audio(text, filename, performer=\"alloy\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}