08-20-1635, chapter 4.2

2024-08-20 16:54:07 +08:00
parent d16300fa0a
commit 83ecdf2ceb
3 changed files with 141 additions and 80 deletions
--- a/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb
+++ b/1000-hours/public/jupyter-notebooks/edge-tts-valcab-pronounciation.ipynb
@@ -84,21 +84,10 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 84,
+   "execution_count": null,
   "id": "4146f92e",
   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "['en-US-GuyNeural', 'en-US-AriaNeural']\n",
-      "important\n",
-      "../audios/important-us-male.mp3 created\n",
-      "../audios/important-us-female.mp3 created\n"
-     ]
-    }
-   ],
+   "outputs": [],
   "source": [
    "\n",
    "voices = [\"en-US-GuyNeural\", \"en-US-AriaNeural\", \"en-GB-RyanNeural\", \"en-GB-LibbyNeural\"]\n",
@@ -112,7 +101,9 @@
    "    print(voices)\n",
    "\n",
    "words = \"\"\"\n",
-    "important\n",
+    "hello,\n",
+    "heat,\n",
+    "high,\n",
    "\"\"\"\n",
    "\n",
    "for word in words.strip().split(','):\n",
@@ -126,7 +117,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 69,
+   "execution_count": 63,
   "id": "2d46cde4",
   "metadata": {},
   "outputs": [],
@@ -146,18 +137,17 @@
    "        ) as response:\n",
    "            response.stream_to_file(path)\n",
    "        \n",
-    "sentence = \"It's a very important aspect\"\n",
+    "sentence = \"The explanation you gave was clear but I need a more detailed explanation.\"\n",
    "\n",
    "# remove all punctuation at the end of sentence,\n",
    "# replace all spaces and punctuations in the sentence with dash\n",
-    "# audio_filename_openai = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_openai.mp3'\n",
-    "# audio_filename_msedge = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_msedge.mp3'\n",
-    "audio_filename_openai = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
-    "audio_filename_msedge = sentence.rstrip(\",.?!\").translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\")\n",
+    "audio_filename_openai = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_openai.mp3'\n",
+    "audio_filename_msedge = sentence.translate(str.maketrans(' ,.?!', '-----')).strip().replace(\"--\", \"-\") + '_msedge.mp3'\n",
    "# get_openai_tts_audio(sentence, audio_filename_openai, performer='alloy')\n",
    "# await generate_edge_tts_audio(sentence, audio_filename_msedge, voice=\"en-US-GuyNeural\", verbose=True, overwrite=True, play=True)\n",
+    "\n",
    "for voice in [\"alloy\", \"nova\"]:\n",
-    "    get_openai_tts_audio(sentence, f'../audios/{audio_filename_openai}-{voice}.mp3', performer=voice)\n"
+    "    get_openai_tts_audio(sentence, f'../audios/{sentence.replace(\" \", \"-\")}-{voice}.mp3', performer=voice)\n"
   ]
  },
  {
@@ -271,62 +261,6 @@
    "* voice = \"en-CA-ClaraNeural\" (Female)\n",
    "* voice = \"en-CA-LiamNeural\" (Male)"
   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 79,
-   "id": "215d423d",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "<_io.BufferedRandom name='../audios/The-art-of-focus-in-our-whirlwind-existence-can-sometimes-feel-like-searching-for-a-needle-in-a-haystack-all-strong.mp3'>"
-      ]
-     },
-     "execution_count": 79,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "\n",
-    "text = \"\"\"\n",
-    "The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
-    "\"\"\"\n",
-    "\n",
-    "# 1 second silence with pydub\n",
-    "from pydub import AudioSegment\n",
-    "sentence = AudioSegment.silent(duration=1000)\n",
-    "\n",
-    "for word in text.strip().split(' '):\n",
-    "    w = word.strip().lower()\n",
-    "    if w == \"a\":\n",
-    "        w = \"uh\"\n",
-    "    if len(w) > 0:\n",
-    "        filename = f'../audios/temp-{w.replace(\" \", \"-\")}-{regions[i]}-{genders[i]}.mp3'\n",
-    "        get_openai_tts_audio(w, filename, performer=\"alloy\")\n",
-    "        sentence += AudioSegment.from_file(filename) + AudioSegment.silent(duration=200)\n",
-    "        # remove the temp file\n",
-    "        os.remove(filename)\n",
-    "sentence += AudioSegment.silent(duration=1000)\n",
-    "# save the sentence as a single audio file\n",
-    "sentence.export(f'../audios/{text.strip().replace(\" \",\"-\")}-all-strong.mp3', format='mp3')\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 77,
-   "id": "5a718cf9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "text = \"\"\"\n",
-    "The art of focus in our whirlwind existence can sometimes feel like searching for a needle in a haystack\n",
-    "\"\"\"\n",
-    "filename = f'../audios/{text.strip().replace(\" \",\"-\")}-natural.mp3'\n",
-    "get_openai_tts_audio(text, filename, performer=\"alloy\")"
-   ]
  }
 ],
 "metadata": {
--- a/1000-hours/public/jupyter-notebooks/phonetics.ipynb
+++ b/1000-hours/public/jupyter-notebooks/phonetics.ipynb
@@ -0,0 +1,127 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import json\n",
+    "import vlc\n",
+    "import re\n",
+    "\n",
+    "# 假设你的 JSON 数据库是一个 JSON 文件，我们将从文件中加载数据\n",
+    "# 如果 JSON 数据在内存中或其他格式，你可能需要修改这部分代码\n",
+    "def load_json_database(file_path):\n",
+    "    records = []\n",
+    "    with open(file_path, 'r') as file:\n",
+    "        for line in file:\n",
+    "            try:\n",
+    "                record = json.loads(line)\n",
+    "                records.append(record)\n",
+    "            except json.JSONDecodeError as e:\n",
+    "                print(f\"Error parsing JSON: {e}\")\n",
+    "    return records\n",
+    "\n",
+    "# The rest of the code remains the same...\n",
+    "\n",
+    "# 在 JSON 数据库中检索 word\n",
+    "def search_in_json_database(database, search_word, region):\n",
+    "    for record in database:\n",
+    "        # 检查 word 字段是否匹配\n",
+    "        if record.get('word') == search_word:\n",
+    "            # 找到匹配项后，获取美式发音信息\n",
+    "            pos_items = record.get('pos_items', [])\n",
+    "            for pos_item in pos_items:\n",
+    "                pronunciations = pos_item.get('pronunciations', [])\n",
+    "                for pronunciation in pronunciations:\n",
+    "                    if pronunciation.get('region') == region:\n",
+    "                        # 找到美式发音，返回相关信息\n",
+    "                        return {\n",
+    "                            'pronunciation': pronunciation.get('pronunciation'),\n",
+    "                            'audio': pronunciation.get('audio')\n",
+    "                        }\n",
+    "    # 如果没有找到匹配的 word 字段，返回 'not exist'\n",
+    "    return 'not exist'\n",
+    "\n",
+    "def search_pronunciation(database, pattern):\n",
+    "    # Compile the regex pattern\n",
+    "    regex = re.compile(pattern)\n",
+    "    results = []\n",
+    "    # Search in the database\n",
+    "    for record in database:\n",
+    "        for pos_item in record[\"pos_items\"]:\n",
+    "            for pronunciation in pos_item[\"pronunciations\"]:\n",
+    "                if regex.search(pronunciation[\"pronunciation\"]):\n",
+    "                    if pronunciation.get('region') == 'us':\n",
+    "                        results.append((record[\"word\"], pronunciation[\"pronunciation\"].replace(\".\", \"\"))) # record[\"_id\"][\"$oid\"],\n",
+    "    # Return all matches (an empty list if nothing matched)\n",
+    "    return results\n",
+    "\n",
+    "# 用于测试的 JSON 数据库文件路径\n",
+    "json_db_file_path = '/Users/joker/github/camdict/cam_dict.refined.json'\n",
+    "\n",
+    "json_database = load_json_database(json_db_file_path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "*balls* <span class=\"pho alt\">bɑːlz</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/ballss-us-male.mp3\" data-audio-us-female=\"/audios/ballss-us-female.mp3\"></span>\n",
+      "*cards* <span class=\"pho alt\">kɑrdz</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/cardss-us-male.mp3\" data-audio-us-female=\"/audios/cardss-us-female.mp3\"></span>\n"
+     ]
+    }
+   ],
+   "source": [
+    "list = \"\"\"\n",
+    "balls,cards\n",
+    "\"\"\"\n",
+    "\n",
+    "for word in list.split(\",\"):\n",
+    "    word = word.strip().lower()\n",
+    "    result = search_in_json_database(json_database, word, 'us')\n",
+    "    if result != 'not exist':\n",
+    "        pho = result['pronunciation']\n",
+    "    else:\n",
+    "        pho = 'not exist'\n",
+    "    line = f'*{word}* <span class=\"pho alt\">{pho}</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/{word}s-us-male.mp3\" data-audio-us-female=\"/audios/{word}s-us-female.mp3\"></span>'\n",
+    "    print(line)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
--- a/1000-hours/sounds-of-american-english/4.2-words.md
+++ b/1000-hours/sounds-of-american-english/4.2-words.md
@@ -4,11 +4,11 @@

 ## 4.2.1. 重音、次重音、非重音、弱音

-当一个词汇由一个以上的音节构成之时，其中的某个音节可能带有重音（*stress*），在音标中使用 <span class="pho">ˈ</span> 作为标记。
+重点在于，英文的音节有**重音**、**次重音**、**非重音**的区别，这一点和亚洲语言明显不同。
+
+如果一个单词只有一个音节，单独读出的时候，就当作**重音**（*stress*）音节读出。而一个多音节词汇中**有且只有一个**重音音节，在音标中使用 <span class="pho">ˈ</span> 作为标记；此外，可能还有另外一些音节是**次重音**（*secondary stress*），在音标中使用 <span class="pho">ˌ</span> 作为标记。比如，*serendipity* <span class="pho alt">ˌserənˈdɪpət̬i</span><span class="speak-word-inline" data-audio-us-male="/audios/serendipity-us-male.mp3" data-audio-us-female="/audios/serendipity-us-female.mp3"></span>，有一个**重音**和一个**次重音**。而 *[Pneumonoultramicroscopicsilicovolcanoconiosis](https://en.wikipedia.org/wiki/Pneumonoultramicroscopicsilicovolcanoconiosis)*<span class="speak-word-inline" data-audio-us-female="/audios/En-us-pneumonoultramicroscopicsilicovolcanoconiosis.ogg.mp3"></span> 总计有 19 个音节，其中 7 个是次重音，唯一的重音是 <span class="pho">oʊ</span>……

-**一个多音节词汇中最多只有一个重音音节**，但，可能还有另外一些音节是**次重音**（*secondary stress*），在音标中使用 <span class="pho">ˌ</span> 作为标记。比如，*serendipity* <span class="pho alt">ˌserənˈdɪpət̬i</span><span class="speak-word-inline" data-audio-us-male="/audios/serendipity-us-male.mp3" data-audio-us-female="/audios/serendipity-us-female.mp3"></span>，有一个**重音**和一个**次重音**。而 *[Pneumonoultramicroscopicsilicovolcanoconiosis](https://en.wikipedia.org/wiki/Pneumonoultramicroscopicsilicovolcanoconiosis)* 总计有 19 个音节，其中 7 个是次重音，唯一的重音是 <span class="pho">oʊ</span>……

-另外一个重点在于，英文的音节有**重音**、**次重音**、**非重音**的区别，这一点也和亚洲语言明显不同。

 形象地讲，5 个不分轻重的汉字（或者日文字、韩文字）排在一起大概是这样的：