08-20-0951, chapter 4

This commit is contained in:
xiaolai
2024-08-20 09:51:14 +08:00
parent c575d0ba0f
commit d7808b3785
9 changed files with 162 additions and 87 deletions

View File

@@ -29,7 +29,7 @@
},
{
"cell_type": "code",
"execution_count": 42,
"execution_count": 49,
"id": "71d35cd9",
"metadata": {},
"outputs": [
@@ -38,81 +38,17 @@
"output_type": "stream",
"text": [
"['en-US-GuyNeural', 'en-US-AriaNeural']\n",
"ma\n",
"../audios/ma-us-male.mp3 created\n",
"../audios/ma-us-female.mp3 created\n",
"hello\n",
"../audios/hello-us-male.mp3 created\n",
"../audios/hello-us-female.mp3 created\n",
"\n",
"room\n",
"../audios/room-us-male.mp3 created\n",
"../audios/room-us-female.mp3 created\n",
"heat\n",
"../audios/heat-us-male.mp3 created\n",
"../audios/heat-us-female.mp3 created\n",
"\n",
"conversation\n",
"../audios/conversation-us-male.mp3 created\n",
"../audios/conversation-us-female.mp3 created\n",
"\n",
"army\n",
"../audios/army-us-male.mp3 created\n",
"../audios/army-us-female.mp3 created\n",
"\n",
"mob\n",
"../audios/mob-us-male.mp3 created\n",
"../audios/mob-us-female.mp3 created\n",
"\n",
"mom\n",
"../audios/mom-us-male.mp3 created\n",
"../audios/mom-us-female.mp3 created\n",
"\n",
"mind\n",
"../audios/mind-us-male.mp3 created\n",
"../audios/mind-us-female.mp3 created\n",
"\n",
"night\n",
"../audios/night-us-male.mp3 created\n",
"../audios/night-us-female.mp3 created\n",
"\n",
"nine\n",
"../audios/nine-us-male.mp3 created\n",
"../audios/nine-us-female.mp3 created\n",
"\n",
"know\n",
"../audios/know-us-male.mp3 created\n",
"../audios/know-us-female.mp3 created\n",
"\n",
"knight\n",
"../audios/knight-us-male.mp3 created\n",
"../audios/knight-us-female.mp3 created\n",
"\n",
"gnaw\n",
"../audios/gnaw-us-male.mp3 created\n",
"../audios/gnaw-us-female.mp3 created\n",
"\n",
"gnome\n",
"../audios/gnome-us-male.mp3 created\n",
"../audios/gnome-us-female.mp3 created\n",
"\n",
"anchor\n",
"../audios/anchor-us-male.mp3 created\n",
"../audios/anchor-us-female.mp3 created\n",
"\n",
"bank\n",
"../audios/bank-us-male.mp3 created\n",
"../audios/bank-us-female.mp3 created\n",
"\n",
"thank\n",
"../audios/thank-us-male.mp3 exists, skipping...\n",
"../audios/thank-us-female.mp3 exists, skipping...\n",
"\n",
"bang\n",
"../audios/bang-us-male.mp3 created\n",
"../audios/bang-us-female.mp3 created\n",
"\n",
"long\n",
"../audios/long-us-male.mp3 created\n",
"../audios/long-us-female.mp3 created\n",
"\n",
"sing\n",
"../audios/sing-us-male.mp3 created\n",
"../audios/sing-us-female.mp3 created\n",
"high\n",
"../audios/high-us-male.mp3 created\n",
"../audios/high-us-female.mp3 created\n",
"\n"
]
}
@@ -143,9 +79,16 @@
" if verbose:\n",
" print(f'{file_name} created')\n",
" \n",
" time.sleep(1.5)\n",
"\n",
"\n",
" time.sleep(1.5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4146f92e",
"metadata": {},
"outputs": [],
"source": [
"\n",
"voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
"regions = ['us', 'us', 'uk', 'uk']\n",
@@ -158,7 +101,9 @@
" print(voices)\n",
"\n",
"words = \"\"\"\n",
"sam\n",
"hello,\n",
"heat,\n",
"high,\n",
"\"\"\"\n",
"\n",
"for word in words.strip().split(','):\n",
@@ -170,6 +115,121 @@
" await generate_edge_tts_audio(w, filename, voice=voice, verbose=True, overwrite=False, play=True)\n"
]
},
{
"cell_type": "code",
"execution_count": 63,
"id": "2d46cde4",
"metadata": {},
"outputs": [],
"source": [
"def get_openai_tts_audio(text, path, performer='alloy'):\n",
"    # Synthesize `text` into an MP3 at `path` using OpenAI's tts-1 model,\n",
"    # streaming the audio response straight to disk.\n",
"    # NOTE(review): OpenAI() is constructed with no arguments after load_dotenv(),\n",
"    # so the API key presumably comes from the environment — confirm .env setup.\n",
"    \n",
"    from openai import OpenAI\n",
"    from dotenv import load_dotenv\n",
"    load_dotenv()\n",
"    client = OpenAI(\n",
"    )\n",
"    \n",
"    # Stream the synthesized speech to `path` as it is generated.\n",
"    with client.audio.speech.with_streaming_response.create(\n",
"        model=\"tts-1\",\n",
"        voice=performer,\n",
"        input=text.strip()\n",
"    ) as response:\n",
"        response.stream_to_file(path)\n",
"    \n",
"sentence = \"The explanation you gave was clear but I need a more detailed explanation.\"\n",
"\n",
"# remove all punctuation at the end of sentence,\n",
"# replace all spaces and punctuations in the sentence with dash\n",
"# NOTE(review): a single replace(\"--\", \"-\") does not collapse runs of three or\n",
"# more dashes (e.g. from \", \" mapping to two dashes next to a space) — confirm inputs.\n",
"audio_filename_openai = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_openai.mp3'\n",
"audio_filename_msedge = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_msedge.mp3'\n",
"# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')\n",
"# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice=\"en-US-GuyNeural\", verbose=True, overwrite=True, play=True)\n",
"\n",
"# NOTE(review): the loop below writes '../audios/<sentence-with-dashes>-<voice>.mp3',\n",
"# not the audio_filename_openai/_msedge names computed above — those appear unused here.\n",
"for voice in [\"alloy\", \"nova\"]:\n",
"    get_openai_tts_audio(sentence, f'../audios/{sentence.replace(\" \", \"-\")}-{voice}.mp3', performer=voice)\n"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "7f219eb1",
"metadata": {},
"outputs": [],
"source": [
"from openai import OpenAI\n",
"import os\n",
"import IPython\n",
"from datetime import datetime\n",
"from mutagen.mp3 import MP3\n",
"from mutagen.id3 import ID3, APIC, TPE1, TALB, TCON\n",
"from dotenv import load_dotenv\n",
"from pydub import AudioSegment\n",
"\n",
"load_dotenv()\n",
"client = OpenAI(\n",
")\n",
"\n",
"# NOTE(review): this redefines get_openai_tts_audio from the earlier cell with a\n",
"# different contract (filename + artwork/ending post-processing); on a full\n",
"# Restart-and-Run-All the later definition silently shadows the earlier one —\n",
"# consider giving the two functions distinct names.\n",
"def get_openai_tts_audio(text, filename, performer=\"alloy\"):\n",
"    # Convert markdown `text` to speech line by line with OpenAI tts-1, join the\n",
"    # pieces with silence, append ending.mp3, tag the MP3, and save it.\n",
"    # Depends on markdown_to_text() and add_metadata() defined elsewhere in this notebook.\n",
"\n",
"    # check artwork.png and ending.mp3 files exist\n",
"    if not os.path.isfile('Artwork.png') or not os.path.isfile('ending.mp3'):\n",
"        print(\"Either Artwork.png or ending.mp3 file not found.\")\n",
"        return\n",
"\n",
"    # split the text into lines\n",
"    text = markdown_to_text(text).split(\"\\n\")\n",
"    # remove empty lines\n",
"    text = [t for t in text if t]\n",
"\n",
"    # NOTE(review): text.index(t) returns the FIRST occurrence, so duplicate lines\n",
"    # all map to the same temp-file index and overwrite each other — prefer enumerate(text).\n",
"    for t in text:\n",
"        speech_file_path = f'temp-{text.index(t)}.mp3'\n",
"        rspd_audio = client.audio.speech.create(\n",
"            model=\"tts-1\",\n",
"            voice=performer,\n",
"            input=t.strip()\n",
"        ) \n",
"        rspd_audio.stream_to_file(speech_file_path)\n",
"        # output a progress percentage \n",
"        # keep updating within a line\n",
"        print(f\"\\rprocessing: {round((text.index(t)+1)/len(text)*100)}%\", end='...')\n",
"    print(\"\\n\")\n",
"\n",
"    # create an audio of 1 second of silence\n",
"    temp_audio = AudioSegment.silent(duration=1000)\n",
"    # Concatenate the per-line clips with 1.5s pauses, cleaning up temp files as we go.\n",
"    for t in text:\n",
"        seg = AudioSegment.from_file(f'temp-{text.index(t)}.mp3')\n",
"        temp_audio += seg + AudioSegment.silent(duration=1500)\n",
"        # delete the temp file\n",
"        os.remove(f'temp-{text.index(t)}.mp3')\n",
"    temp_audio.export('~temp.mp3', format='mp3')\n",
"    speech = AudioSegment.from_file('~temp.mp3')\n",
"    ending = AudioSegment.from_file('ending.mp3')\n",
"    combined = speech + ending\n",
"    os.remove('~temp.mp3')\n",
"    if filename:\n",
"        # if filename has no extension, add .mp3\n",
"        # NOTE(review): the literal '(unknown).mp3' below discards the caller's\n",
"        # filename entirely — looks like a mangled f-string placeholder; confirm intent.\n",
"        if filename.endswith('.mp3'):\n",
"            speech_file_path = filename\n",
"        else:\n",
"            speech_file_path = f'(unknown).mp3' \n",
"    else:\n",
"        # No filename given: fall back to a timestamped name including the voice.\n",
"        speech_file_path = f'{datetime.now().strftime(\"%Y%m%d_%H%M%S\")}_{performer}.mp3'\n",
"    combined.export(speech_file_path, format='mp3')\n",
"    print(f\"Audio file saved as {speech_file_path}\")\n",
"\n",
"    image_file = 'Artwork.png'\n",
"    artist = 'tts'\n",
"    album = 'Daily Speech Training'\n",
"    genre = 'SPEECH'\n",
"\n",
"    add_metadata(speech_file_path, image_file, artist, album, genre)\n",
"    # NOTE(review): IPython.display.Audio(...) is created but neither display()ed\n",
"    # nor returned, so it likely never renders from inside this function — confirm.\n",
"    IPython.display.Audio(speech_file_path)\n",
"\n",
"    return f'{speech_file_path} created successfully.'\n",
"\n"
]
},
{
"cell_type": "markdown",
"id": "2df59a42",

View File

@@ -1,19 +1,34 @@
# 4. 自然语流
**音素**是构成自然语流的最基础单位。随后,一个或者多个音素构成**音节**;一个或者多个音节构成**词汇**;一个或者多个词汇构成**句子** —— 而这基本上就是**语流**的构成过程。
**音素**是构成自然语流的最基础单位。随后
在**自然语流**中,50 多个音素各自的变化(或者变体)又多又普遍。
> * 一个或者多个音素构成**音节**
> * 一个或者多个音节构成**词汇**
> * 一个或者多个词汇构成**句子**
> * 一个或者多个词汇构成**对话**或者**篇章**
任何一个音素都一样,实际上并不存在一个像音乐音符那样可以 100% 精确的标准。每个音素都有**长短**、**强弱**、**高低**、**起伏**、**轻重**、**缓急**各个维度上并不统一的变化……
—— 而这基本上就是**自然语流**的构成过程。
即便是相同的单词,在同一句话里都常常读法并不完全相同:
在**自然语流**中,每个**音素**都可能存在一定的变化。
> * **communication**: Her *communication* skills are excellent, but her com*m*unication of the project details needs work.
> * **explanation**: The *explanation* you gave was clear, but I need a more detailed *explanation*.
任何一个音素都一样,实际上并不存在一个像音乐音符那样可以 100% 精确的标准。时时刻刻,每个音素都有**长短**、**强弱**、**高低**、**起伏**、**轻重**、**缓急**等等各个维度上并不统一的变化 —— 不仅如此,说话的每个人又有着各自的特质,包括但不限于无法一致的音质、音域、音量、语速、腔调、情绪,等等等等…… 也正因如此,最终每个人的说话方式都各不相同 —— 实际上是没办法完全相同。其实,不仅英语如此,地球上的所有语言都是如此。
即便是相同的单词,在同一句话里都常常读法并不完全相同,也无法完全相同 —— 注意两个相同的词的每个音节的**音高**和**声调**的不同:
也正因如此,最终每个人的说话方式都各不相同 —— 实际上是没办法完全相同,因为多个维度组合起来,排列组合的可能性其实完全是天文数字的量级。其实,不仅英语如此,地球上的所有语言都是如此。
> * **communication**: Her *communication* skills are excellent, but her *communication* of the project details needs work.<span class="speak-word-inline" data-audio-us-male="/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-alloy.mp3" data-audio-us-female="/audios/Her-communication-skills-are-excellent-but-her-communication-of-the-project-details-needs-work-nova.mp3"></span>
> * **explanation**: The *explanation* you gave was clear, but I need a more detailed *explanation*.<span class="speak-word-inline" data-audio-us-male="/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-alloy.mp3" data-audio-us-female="/audios/The-explanation-you-gave-was-clear-but-I-need-a-more-detailed-explanation-nova.mp3"></span>
接下来,在对 “音素音标” 已经有了足够了解的前提下,我们将按照以下的顺序进行讲解
1974 年,美国密西根州立大学(Michigan State University)的人工语言实验室的研究人员曾经打电话用机器生成的语音订购一块披萨[^1]…… 以下是机器语音合成 50 年前后的对比:
> 句子 ⭢ 词汇 ⭢ 音节
> Text: Would you please phrase that question so that I can answer it with yes or no?
> * 1974
> * Michigan State University <span class="speak-word-inline" data-audio-us-male="/audios/segment-donald-sherman-ordered-a-pizza.mp3"></span>
> * 2024
> * OpenAI TTS (Alloy)<span class="speak-word-inline" data-audio-us-male="/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_openai.mp3"></span>
> * Microsoft Edge TTS (en-US-GuyNeural)<span class="speak-word-inline" data-audio-us-male="/audios/Would-you-please-phrase-that-question-so-that-I-can-answer-it-with-yes-or-no_msedge.mp3"></span>
显然,模拟真人的自然语流,并不只是 “把每个音素朗读标准” —— 除此之外需要考虑的因素实在是太多,而各个维度的不同再组合起来就是天文数字的量级…… 乃至于需要将近 50 年的时间,以计算机算力的提高、算法的改良进步为前提,而后还要配合着大规模神经网络以及基于大语言模型的人工智能,才有了这样极其接近 “真实” 的效果。
我们用自己的嗓音说话也是如此。要做的事情,不仅仅是 “把每个音素” 读准读好,也不仅仅是 “把每个单词读得像词典里的真人发音一样”,我们需要从多个维度调整自己 —— 当然很麻烦,不过,事实证明,也的确是能做到做好的事情。
[^1]: https://www.youtube.com/watch?v=94d_h_t2QAA