diff --git a/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-alloy.mp3 b/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-alloy.mp3 new file mode 100644 index 00000000..ea801f02 Binary files /dev/null and b/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-alloy.mp3 differ diff --git a/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-nova.mp3 b/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-nova.mp3 new file mode 100644 index 00000000..531c05d8 Binary files /dev/null and b/1000-hours/public/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-nova.mp3 differ diff --git a/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-alloy.mp3 b/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-alloy.mp3 new file mode 100644 index 00000000..824cfc90 Binary files /dev/null and b/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-alloy.mp3 differ diff --git a/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-nova.mp3 b/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-nova.mp3 new file mode 100644 index 00000000..9d549726 Binary files /dev/null and b/1000-hours/public/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-nova.mp3 differ diff --git a/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_msedge.mp3 
b/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_msedge.mp3 new file mode 100644 index 00000000..721cae77 Binary files /dev/null and b/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_msedge.mp3 differ diff --git a/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_openai.mp3 b/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_openai.mp3 new file mode 100644 index 00000000..ff7e33c6 Binary files /dev/null and b/1000-hours/public/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_openai.mp3 differ diff --git a/1000-hours/public/audios/segment-donald-sherman-ordered-a-pizza.mp3 b/1000-hours/public/audios/segment-donald-sherman-ordered-a-pizza.mp3 new file mode 100644 index 00000000..b8d79659 Binary files /dev/null and b/1000-hours/public/audios/segment-donald-sherman-ordered-a-pizza.mp3 differ diff --git a/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb b/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb index 45f49de2..5ca36b08 100644 --- a/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb +++ b/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb @@ -29,7 +29,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 49, "id": "71d35cd9", "metadata": {}, "outputs": [ @@ -38,81 +38,17 @@ "output_type": "stream", "text": [ "['en-US-GuyNeural', 'en-US-AriaNeural']\n", - "ma\n", - "../audios/ma-us-male.mp3 created\n", - "../audios/ma-us-female.mp3 created\n", + "hello\n", + "../audios/hello-us-male.mp3 created\n", + "../audios/hello-us-female.mp3 created\n", "\n", - "room\n", - "../audios/room-us-male.mp3 created\n", - "../audios/room-us-female.mp3 created\n", + "heat\n", + "../audios/heat-us-male.mp3 created\n", + 
"../audios/heat-us-female.mp3 created\n", "\n", - "conversation\n", - "../audios/conversation-us-male.mp3 created\n", - "../audios/conversation-us-female.mp3 created\n", - "\n", - "army\n", - "../audios/army-us-male.mp3 created\n", - "../audios/army-us-female.mp3 created\n", - "\n", - "mob\n", - "../audios/mob-us-male.mp3 created\n", - "../audios/mob-us-female.mp3 created\n", - "\n", - "mom\n", - "../audios/mom-us-male.mp3 created\n", - "../audios/mom-us-female.mp3 created\n", - "\n", - "mind\n", - "../audios/mind-us-male.mp3 created\n", - "../audios/mind-us-female.mp3 created\n", - "\n", - "night\n", - "../audios/night-us-male.mp3 created\n", - "../audios/night-us-female.mp3 created\n", - "\n", - "nine\n", - "../audios/nine-us-male.mp3 created\n", - "../audios/nine-us-female.mp3 created\n", - "\n", - "know\n", - "../audios/know-us-male.mp3 created\n", - "../audios/know-us-female.mp3 created\n", - "\n", - "knight\n", - "../audios/knight-us-male.mp3 created\n", - "../audios/knight-us-female.mp3 created\n", - "\n", - "gnaw\n", - "../audios/gnaw-us-male.mp3 created\n", - "../audios/gnaw-us-female.mp3 created\n", - "\n", - "gnome\n", - "../audios/gnome-us-male.mp3 created\n", - "../audios/gnome-us-female.mp3 created\n", - "\n", - "anchor\n", - "../audios/anchor-us-male.mp3 created\n", - "../audios/anchor-us-female.mp3 created\n", - "\n", - "bank\n", - "../audios/bank-us-male.mp3 created\n", - "../audios/bank-us-female.mp3 created\n", - "\n", - "thank\n", - "../audios/thank-us-male.mp3 exists, skipping...\n", - "../audios/thank-us-female.mp3 exists, skipping...\n", - "\n", - "bang\n", - "../audios/bang-us-male.mp3 created\n", - "../audios/bang-us-female.mp3 created\n", - "\n", - "long\n", - "../audios/long-us-male.mp3 created\n", - "../audios/long-us-female.mp3 created\n", - "\n", - "sing\n", - "../audios/sing-us-male.mp3 created\n", - "../audios/sing-us-female.mp3 created\n", + "high\n", + "../audios/high-us-male.mp3 created\n", + "../audios/high-us-female.mp3 
created\n", "\n" ] } @@ -143,9 +79,16 @@ " if verbose:\n", " print(f'{file_name} created')\n", " \n", - " time.sleep(1.5)\n", - "\n", - "\n", + " time.sleep(1.5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4146f92e", + "metadata": {}, + "outputs": [], + "source": [ "\n", "voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n", "regions = ['us', 'us', 'uk', 'uk']\n", @@ -158,7 +101,9 @@ " print(voices)\n", "\n", "words = \"\"\"\n", - "sam\n", + "hello,\n", + "heat,\n", + "high,\n", "\"\"\"\n", "\n", "for word in words.strip().split(','):\n", @@ -170,6 +115,121 @@ " await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n" ] }, + { + "cell_type": "code", + "execution_count": 63, + "id": "2d46cde4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_openai_tts_audio(text, path, performer='alloy'):\n", + " \n", + " from openai import OpenAI\n", + " from dotenv import load_dotenv\n", + " load_dotenv()\n", + " client = OpenAI(\n", + " )\n", + " \n", + " with client.audio.speech.with_streaming_response.create(\n", + " model=\"tts-1\",\n", + " voice=performer,\n", + " input=text.strip()\n", + " ) as response:\n", + " response.stream_to_file(path)\n", + " \n", + "sentence = \"The explanation you gave was clear but I need a more detailed explanation.\"\n", + "\n", + "# remove all punctuation at the end of sentence,\n", + "# replace all spaces and punctuations in the sentence with dash\n", + "audio_filename_openai = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_openai.mp3'\n", + "audio_filename_msedge = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_msedge.mp3'\n", + "# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')\n", + "# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice=\"en-US-GuyNeural\", verbose=True, overwrite=True, 
play=True)\n", + "\n", + "for voice in [\"alloy\", \"nova\"]:\n", + " get_openai_tts_audio(sentence, f'../audios/{sentence.replace(\" \", \"-\")}-{voice}.mp3', performer=voice)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "id": "7f219eb1", + "metadata": {}, + "outputs": [], + "source": [ + "from openai import OpenAI\n", + "import os\n", + "import IPython\n", + "from datetime import datetime\n", + "from mutagen.mp3 import MP3\n", + "from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n", + "from dotenv import load_dotenv\n", + "from pydub import AudioSegment\n", + "\n", + "load_dotenv()\n", + "client = OpenAI(\n", + ")\n", + "\n", + "def get_openai_tts_audio(text, filename, performer=\"alloy\"):\n", + "\n", + " # check artwork.png and ending.mp3 files exist\n", + " if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):\n", + " print(\"Either Artwork.png or ending.mp3 file not found.\")\n", + " return\n", + "\n", + " # split the text into lines\n", + " text = markdown_to_text(text).split(\"\\n\")\n", + " # remove empty lines\n", + " text = [t for t in text if t]\n", + "\n", + " for t in text:\n", + " speech_file_path = f'temp-{text.index(t)}.mp3'\n", + " rspd_audio = client.audio.speech.create(\n", + " model=\"tts-1\",\n", + " voice=performer,\n", + " input=t.strip()\n", + " ) \n", + " rspd_audio.stream_to_file(speech_file_path)\n", + " # output a progress percentage \n", + " # keep updating within a line\n", + " print(f\"\\rprocessing: {round((text.index(t)+1)/len(text)*100)}%\", end='...')\n", + " print(\"\\n\")\n", + "\n", + " # create an audio of 1 second of silence\n", + " temp_audio = AudioSegment.silent(duration=1000)\n", + " for t in text:\n", + " seg = AudioSegment.from_file(f'temp-{text.index(t)}.mp3')\n", + " temp_audio += seg + AudioSegment.silent(duration=1500)\n", + " # delete the temp file\n", + " os.remove(f'temp-{text.index(t)}.mp3')\n", + " temp_audio.export('~temp.mp3', format='mp3')\n", + " speech = 
AudioSegment.from_file('~temp.mp3')\n", + " ending = AudioSegment.from_file('ending.mp3')\n", + " combined = speech + ending\n", + " os.remove('~temp.mp3')\n", + " if filename:\n", + " # if filename has no extension, add .mp3\n", + " if filename.endswith('.mp3'):\n", + " speech_file_path = filename\n", + " else:\n", + " speech_file_path = f'{filename}.mp3' \n", + " else:\n", + " speech_file_path = f'{datetime.now().strftime(\"%Y%m%d_%H%M%S\")}_{performer}.mp3'\n", + " combined.export(speech_file_path, format='mp3')\n", + " print(f\"Audio file saved as {speech_file_path}\")\n", + "\n", + " image_file = 'Artwork.png'\n", + " artist = 'tts'\n", + " album = 'Daily Speech Training'\n", + " genre = 'SPEECH'\n", + "\n", + " add_metadata(speech_file_path, image_file, artist, album, genre)\n", + " IPython.display.Audio(speech_file_path)\n", + "\n", + " return f'{speech_file_path} created successfully.'\n", + "\n" + ] + }, { "cell_type": "markdown", "id": "2df59a42", diff --git a/1000-hours/sounds-of-american-english/4-natural-speech.md b/1000-hours/sounds-of-american-english/4-natural-speech.md index d56330d8..f3901649 100644 --- a/1000-hours/sounds-of-american-english/4-natural-speech.md +++ b/1000-hours/sounds-of-american-english/4-natural-speech.md @@ -1,19 +1,34 @@ # 4. 自然语流 -**音素**是构成自然语流的最基础单位。随后,一个或者多个音素构成**音节**;一个或者多个音节构成**词汇**;一个或者多个词汇构成**句子** —— 而这这基本上就是**语流**的构成过程。 +**音素**是构成自然语流的最基础单位。随后: -在**自然语流**中,50 多个音素各自的变化(或者变体)又多又普遍。 +> * 一个或者多个音素构成**音节**; +> * 一个或者多个音节构成**词汇**; +> * 一个或者多个词汇构成**句子**; +> * 一个或者多个句子构成**对话**或者**篇章**; -任何一个音素都一样,实际上并不存在一个像音乐音符那样可以 100% 精确的标准。每个音素都有**长短**、**强弱**、**高低**、**起伏**、**轻重**、**缓急**各个维度上并不统一的变化…… +—— 而这基本上就是**自然语流**的构成过程。 -即便是相同的单词,在同一句话里都常常读法并不完全相同: +在**自然语流**中,每个**音素**都可能存在一定的变化。 -> * **communication**: Her *communication* skills are excellent, but her com*m*unication of the project details needs work. -> * **explanation**: The *explanation* you gave was clear, but I need a more detailed *explanation*. 
+任何一个音素都一样,实际上并不存在一个像音乐音符那样可以 100% 精确的标准。时时刻刻,每个音素都有**长短**、**强弱**、**高低**、**起伏**、**轻重**、**缓急**等等各个维度上并不统一的变化 —— 不仅如此,说话的每个人又有着各自的特质,包括但不限于无法一致的音质、音域、音量、语速、腔调、情绪,等等等等…… 也正因如此,最终每个人的说话方式都各不相同 —— 实际上是没办法完全相同。其实,不仅英语如此,地球上的所有语言都是如此。 +即便是相同的单词,在同一句话里都常常读法并不完全相同,也无法完全相同 —— 注意两个相同的词的每个音节的**音高**和**声调**的不同: -也正因如此,最终每个人的说话方式都各不相同 —— 实际上是没办法完全相同,因为多个维度组合起来,排列组合的可能性其实完全是天文数字的量级。其实,不仅英语如此,地球上的所有语言都是如此。 +> * **communication**: Her *communication* skills are excellent, but her *communication* of the project details needs work. +> * **explanation**: The *explanation* you gave was clear, but I need a more detailed *explanation*. -接下来,在对 “音素音标” 已经有了足够了解的前提下,我们将按照以下的顺序进行讲解: +1974 年,美国密西根州立大学(Michigan State University)的人工语言实验室的研究人员曾经打电话用机器生成的语音订购一块披萨[^1]…… 以下是机器语音合成 50 年前后的对比: -> 句子 ⭢ 词汇 ⭢ 音节 +> Text: Would you please phrase that question so that I can answer it with yes or no? +> * 1974 +> * Michigan State University +> * 2024 +> * OpenAI TTS (Alloy) +> * Microsoft Edge TTS (en-US-GuyNeural) + +显然,模拟真人的自然语流,并不只是 “把每个音素朗读标准” —— 除此之外需要考虑的因素实在是太多,而各个维度的不同再组合起来就是天文数字的量级…… 乃至于需要将近 50 年的时间,以计算机算力的提高、算法的改良进步为前提,而后还要配合着大规模神经网络以及基于大语言模型的人工智能才有了这样极其接近 “真实” 的效果。 + +我们用自己的嗓音说话也是如此。要做的事情,不仅仅是 “把每个音素读准读好”,也不仅仅是 “把每个单词读得像词典里的真人发音一样”,我们需要从多个维度调整自己 —— 当然很麻烦,不过,事实证明,也的确是能做到做好的事情。 + +[^1]: https://www.youtube.com/watch?v=94d_h_t2QAA \ No newline at end of file