08-20-0951, chapter 4

This commit is contained in:
xiaolai
2024-08-20 09:51:14 +08:00
parent c575d0ba0f
commit d7808b3785
9 changed files with 162 additions and 87 deletions

View File

@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 49,
"id": "71d35cd9",
"metadata": {},
"outputs": [
@@ -38,81 +38,17 @@
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"ma\n",
"../audios/ma-us-male.mp3 created\n",
"../audios/ma-us-female.mp3 created\n",
"hello\n",
"../audios/hello-us-male.mp3 created\n",
"../audios/hello-us-female.mp3 created\n",
"\n",
"room\n",
"../audios/room-us-male.mp3 created\n",
"../audios/room-us-female.mp3 created\n",
"heat\n",
"../audios/heat-us-male.mp3 created\n",
"../audios/heat-us-female.mp3 created\n",
"\n",
"conversation\n",
"../audios/conversation-us-male.mp3 created\n",
"../audios/conversation-us-female.mp3 created\n",
"\n",
"army\n",
"../audios/army-us-male.mp3 created\n",
"../audios/army-us-female.mp3 created\n",
"\n",
"mob\n",
"../audios/mob-us-male.mp3 created\n",
"../audios/mob-us-female.mp3 created\n",
"\n",
"mom\n",
"../audios/mom-us-male.mp3 created\n",
"../audios/mom-us-female.mp3 created\n",
"\n",
"mind\n",
"../audios/mind-us-male.mp3 created\n",
"../audios/mind-us-female.mp3 created\n",
"\n",
"night\n",
"../audios/night-us-male.mp3 created\n",
"../audios/night-us-female.mp3 created\n",
"\n",
"nine\n",
"../audios/nine-us-male.mp3 created\n",
"../audios/nine-us-female.mp3 created\n",
"\n",
"know\n",
"../audios/know-us-male.mp3 created\n",
"../audios/know-us-female.mp3 created\n",
"\n",
"knight\n",
"../audios/knight-us-male.mp3 created\n",
"../audios/knight-us-female.mp3 created\n",
"\n",
"gnaw\n",
"../audios/gnaw-us-male.mp3 created\n",
"../audios/gnaw-us-female.mp3 created\n",
"\n",
"gnome\n",
"../audios/gnome-us-male.mp3 created\n",
"../audios/gnome-us-female.mp3 created\n",
"\n",
"anchor\n",
"../audios/anchor-us-male.mp3 created\n",
"../audios/anchor-us-female.mp3 created\n",
"\n",
"bank\n",
"../audios/bank-us-male.mp3 created\n",
"../audios/bank-us-female.mp3 created\n",
"\n",
"thank\n",
"../audios/thank-us-male.mp3 exists, skipping...\n",
"../audios/thank-us-female.mp3 exists, skipping...\n",
"\n",
"bang\n",
"../audios/bang-us-male.mp3 created\n",
"../audios/bang-us-female.mp3 created\n",
"\n",
"long\n",
"../audios/long-us-male.mp3 created\n",
"../audios/long-us-female.mp3 created\n",
"\n",
"sing\n",
"../audios/sing-us-male.mp3 created\n",
"../audios/sing-us-female.mp3 created\n",
"high\n",
"../audios/high-us-male.mp3 created\n",
"../audios/high-us-female.mp3 created\n",
"\n"
]
}
@@ -143,9 +79,16 @@
" if verbose:\n",
" print(f'{file_name} created')\n",
" \n",
" time.sleep(1.5)\n",
"\n",
"\n",
" time.sleep(1.5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4146f92e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
"regions = ['us', 'us', 'uk', 'uk']\n",
@@ -158,7 +101,9 @@
" print(voices)\n",
"\n",
"words = \"\"\"\n",
"sam\n",
"hello,\n",
"heat,\n",
"high,\n",
"\"\"\"\n",
"\n",
"for word in words.strip().split(','):\n",
@@ -170,6 +115,121 @@
" await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "2d46cde4",
"metadata": {},
"outputs": [],
"source": [
def get_openai_tts_audio(text, path, performer='alloy'):
    """Synthesize `text` into an mp3 file at `path` with OpenAI's tts-1 model.

    Parameters
    ----------
    text : str
        Text to speak; surrounding whitespace is stripped before the request.
    path : str
        Destination file path the audio stream is written to.
    performer : str
        OpenAI voice name (default 'alloy').
    """
    from openai import OpenAI
    from dotenv import load_dotenv

    # Credentials (OPENAI_API_KEY) come from the environment / .env file.
    load_dotenv()
    client = OpenAI()

    request = client.audio.speech.with_streaming_response.create(
        model="tts-1",
        voice=performer,
        input=text.strip()
    )
    # Stream the response straight to disk instead of buffering it in memory.
    with request as response:
        response.stream_to_file(path)
" \n",
sentence = "The explanation you gave was clear but I need a more detailed explanation."

# Build dash-separated audio file names: spaces and , . ? ! each map to '-',
# then one pass collapses doubled dashes (e.g. from ", " pairs).
_to_dash = str.maketrans(' ,.?!', '-----')
slug = sentence.translate(_to_dash).strip().replace("--", "-")
audio_filename_openai = slug + '_openai.mp3'
audio_filename_msedge = slug + '_msedge.mp3'
# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')
# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice="en-US-GuyNeural", verbose=True, overwrite=True, play=True)

# Render the same sentence with two OpenAI voices for comparison.
for voice in ["alloy", "nova"]:
    get_openai_tts_audio(sentence, f'../audios/{sentence.replace(" ", "-")}-{voice}.mp3', performer=voice)
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "7f219eb1",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"import os\n",
"import IPython\n",
"from datetime import datetime\n",
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"from dotenv import load_dotenv\n",
"from pydub import AudioSegment\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(\n",
")\n",
"\n",
def get_openai_tts_audio(text, filename, performer="alloy"):
    """Turn markdown `text` into one narrated mp3 with cover art and ID3 tags.

    Each non-empty plain-text line is synthesized separately with OpenAI
    tts-1, the pieces are joined with 1.5 s pauses (plus a 1 s lead-in),
    'ending.mp3' is appended, and metadata/artwork are attached via the
    module-level `add_metadata` helper.

    Parameters
    ----------
    text : str
        Markdown source; converted with `markdown_to_text` and split on lines.
    filename : str or falsy
        Output name; '.mp3' is appended when missing. Falsy -> a
        timestamp-plus-voice name is generated.
    performer : str
        OpenAI voice name (default "alloy").

    Returns
    -------
    str or None
        Success message, or None when the required asset files are missing.
    """
    # Verify static assets exist before doing any (paid) API work.
    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):
        print("Either Artwork.png or ending.mp3 file not found.")
        return

    # Split into non-empty plain-text lines.
    lines = [t for t in markdown_to_text(text).split("\n") if t]

    # Synthesize one temp mp3 per line. enumerate() gives every line a unique
    # index; the previous list.index(t) reused the first match for duplicate
    # lines, overwriting one temp file and later crashing on a double delete.
    for i, t in enumerate(lines):
        rspd_audio = client.audio.speech.create(
            model="tts-1",
            voice=performer,
            input=t.strip()
        )
        rspd_audio.stream_to_file(f'temp-{i}.mp3')
        # Single-line progress indicator, updated in place.
        print(f"\rprocessing: {round((i + 1) / len(lines) * 100)}%", end='...')
    print("\n")

    # Stitch the pieces: 1 s of leading silence, 1.5 s pause after each line.
    temp_audio = AudioSegment.silent(duration=1000)
    for i in range(len(lines)):
        seg = AudioSegment.from_file(f'temp-{i}.mp3')
        temp_audio += seg + AudioSegment.silent(duration=1500)
        os.remove(f'temp-{i}.mp3')  # temp piece no longer needed
    temp_audio.export('~temp.mp3', format='mp3')
    speech = AudioSegment.from_file('~temp.mp3')
    ending = AudioSegment.from_file('ending.mp3')
    combined = speech + ending
    os.remove('~temp.mp3')

    # Resolve the output file name.
    if filename:
        if filename.endswith('.mp3'):
            speech_file_path = filename
        else:
            # Append the extension instead of discarding the caller's name.
            speech_file_path = f'{filename}.mp3'
    else:
        speech_file_path = f'{datetime.now().strftime("%Y%m%d_%H%M%S")}_{performer}.mp3'
    combined.export(speech_file_path, format='mp3')
    print(f"Audio file saved as {speech_file_path}")

    image_file = 'Artwork.png'
    artist = 'tts'
    album = 'Daily Speech Training'
    genre = 'SPEECH'

    add_metadata(speech_file_path, image_file, artist, album, genre)
    # display() is required for the player widget to render from inside a
    # function; the bare Audio(...) expression was created and discarded.
    IPython.display.display(IPython.display.Audio(speech_file_path))

    return f'{speech_file_path} created successfully.'
]
},
{
"cell_type": "markdown",
"id": "2df59a42",