{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import requests\n",
"import json\n",
"import vlc\n",
"import re\n",
"import random\n",
"from IPython.display import Audio\n",
"\n",
"def load_json_database(url):\n",
"    \"\"\"Download a JSON-Lines file from `url` and parse it into a list of dicts.\n",
"\n",
"    Malformed lines are skipped (with a message) so one bad record does not\n",
"    abort the whole load; a network error yields an empty list.\n",
"    \"\"\"\n",
"    records = []\n",
"    try:\n",
"        # A timeout keeps the cell from hanging forever on a stalled connection.\n",
"        response = requests.get(url, timeout=30)\n",
"        response.raise_for_status()  # Raise an error for bad status codes\n",
"        for line in response.iter_lines(decode_unicode=True):\n",
"            if line:\n",
"                try:\n",
"                    record = json.loads(line)\n",
"                    records.append(record)\n",
"                except json.JSONDecodeError as e:\n",
"                    print(f\"Error parsing JSON: {e}\")\n",
"    except requests.exceptions.RequestException as e:\n",
"        print(f\"Error fetching data from URL: {e}\")\n",
"    return records\n",
"\n",
"def search_in_json_database(database, search_word, region):\n",
"    \"\"\"Return {'pronunciation', 'audio'} for `search_word` in `region`.\n",
"\n",
"    Scans records whose 'word' field equals `search_word` and returns the\n",
"    first pronunciation entry whose 'region' matches (e.g. 'us' or 'uk').\n",
"    Returns the sentinel string 'not exist' when nothing matches; the cells\n",
"    below compare against that exact string, so it is kept for compatibility.\n",
"    \"\"\"\n",
"    for record in database:\n",
"        # Check whether the 'word' field matches the query.\n",
"        if record.get('word') == search_word:\n",
"            # Match found: look for a pronunciation in the requested region.\n",
"            pos_items = record.get('pos_items', [])\n",
"            for pos_item in pos_items:\n",
"                pronunciations = pos_item.get('pronunciations', [])\n",
"                for pronunciation in pronunciations:\n",
"                    if pronunciation.get('region') == region:\n",
"                        # Found the requested region; return its details.\n",
"                        return {\n",
"                            'pronunciation': pronunciation.get('pronunciation'),\n",
"                            'audio': pronunciation.get('audio')\n",
"                        }\n",
"    # No record matched (or no pronunciation for this region): sentinel value.\n",
"    return 'not exist'\n",
"\n",
"url = \"https://raw.githubusercontent.com/zelic91/camdict/main/cam_dict.refined.json\"\n",
"\n",
"json_database = load_json_database(url)\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"之前讲过,非重音音节里的元音可能会被弱化为 schwa ə…… 在自然语流中,连 schwa ə 都可能会被进一步弱化,变成非常轻的 ɤ。比如,常用词 reasonable,/ˈriːzənəbəl/,实际听到的常常是 /ˈriːzɤnəbəl/……\n",
"\n",
"phonetics_not_exist: \n",
"\n",
"\n"
]
}
],
"source": [
"# Extract example words from the draft text, look up their US IPA in the\n",
"# dictionary loaded above, and reprint each line with pronunciations inline.\n",
"# Words missing from the dictionary are collected in `phonetics_not_exist`.\n",
"text =\"\"\"\n",
"之前讲过,非重音音节里的元音可能会被弱化为 schwa `ə`…… 在自然语流中,连 schwa `ə` 都可能会被进一步弱化,变成非常轻的 `ɤ`。比如,常用词 reasonable,`/ˈriːzənəbəl/`,实际听到的常常是` /ˈriːzɤnəbəl/`……\n",
"\"\"\"\n",
"\n",
"# Comma-joined accumulator of all example words (consumed by the TTS cell below).\n",
"words = \"\"\n",
"phonetics_not_exist = ''\n",
"lines = text.split(\"\\n\")\n",
"for line in lines:\n",
"    # Strip the backtick markup that wraps IPA snippets in the draft text.\n",
"    line = line.replace(\" `\", \" \")\n",
"    line = line.replace(\"`\", \"\")\n",
"    # Only lines containing markdown bold (*word*) carry example words.\n",
"    if '*' in line:\n",
"        line = line.replace('\"', \"**\")\n",
"        # NOTE(review): assumes the line looks like 'head - ex1, ex2';\n",
"        # split(\"-\")[1] raises IndexError when no '-' is present — confirm input format.\n",
"        examples = line.split(\"-\")[1].split(\",\")\n",
"        examples = [x.strip() for x in examples]\n",
"        line = line.replace(\" - \", \"\\n\")\n",
"        # print(examples)\n",
"        # wrap examples in span,\n",
"        for e in examples:\n",
"            # join e in words with ','\n",
"            words += e + \",\"\n",
"            entry_us = search_in_json_database(json_database, e, 'us')\n",
"            if entry_us == 'not exist':\n",
"                # No dictionary entry: keep the sentinel text and record the miss.\n",
"                phonetics = entry_us\n",
"                phonetics_not_exist += f'{e},'\n",
"            else:\n",
"                phonetics = entry_us['pronunciation'] \n",
"            # Rewrite the word as an indented markdown list item followed by its IPA.\n",
"            wrapped_e = f'\\t- {e} {phonetics} \\n'\n",
"            line = line.replace(e, wrapped_e).replace(',', '').strip()\n",
"\n",
"    print(line)\n",
"print(f'phonetics_not_exist: {phonetics_not_exist}')\n",
"print('\\n'+words)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['reasonable']\n",
"reasonable\n",
"Files created!\n"
]
}
],
"source": [
"# Generate one MP3 per word and per voice with Edge-TTS.\n",
"# Depends on `words` accumulated in the previous cell.\n",
"import asyncio\n",
"import edge_tts\n",
"import pygame\n",
"# Drop the trailing comma left by the accumulation loop, then split into words.\n",
"text = words.rstrip(\",\")\n",
"Wordlist = text.split(\",\")\n",
"# Wordlist = ['reasonable']\n",
"\n",
"print(Wordlist)\n",
"for w in Wordlist:\n",
"    # for VOICE in ['en-US-GuyNeural', 'en-US-JennyNeural', 'en-GB-RyanNeural', 'en-GB-SoniaNeural']:\n",
"    # One male and one female US voice per word.\n",
"    for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n",
"        w = w.strip()\n",
"        # Output filename pattern: '<word>-us-male.mp3' / '<word>-us-female.mp3'.\n",
"        # OUTPUT_FILE = f\"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3\"\n",
"        OUTPUT_FILE = f\"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n",
"        communicate = edge_tts.Communicate(w, VOICE)\n",
"        # Top-level await works inside Jupyter's running IPython event loop.\n",
"        await communicate.save(OUTPUT_FILE) \n",
"        print(w)\n",
"print(\"Files created!\") "
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"举个例子,*ichthyosaur*,这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,ˈɪk.θi.ə.sɔːr —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,ˈɪk.θiə.sɔːr,*ich* ⭤ ˈɪk, *thyo* ⭤ θiə, *saur* ⭤ sɔːr…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n",
"\n",
"Audio files for ichthyosaur created!\n",
"\n",
"举个例子,*ichthyosaur* ˈɪk.θi.ə.sɔːr \n",
",这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,ˈɪk.θi.ə.sɔːr —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,ˈɪk.θiə.sɔːr,*ich* ⭤ ˈɪk, *thyo* ⭤ θiə, *saur* ⭤ sɔːr…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n",
"\n",
"Text copied to clipboard!\n"
]
}
],
"source": [
"# Clean the draft text, synthesize audio for the listed words, inject their\n",
"# US IPA back into the text, and copy the result to the clipboard.\n",
"text =\"\"\"\n",
"举个例子,*ichthyosaur*,这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,`/ˈɪk.θi.ə.sɔːr/` —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,`/ˈɪk.θiə.sɔːr/`,*ich* ⭤ `/ˈɪk/`, *thyo* ⭤ `/θiə/`, *saur* ⭤ `/sɔːr/`…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n",
"\"\"\"\n",
"\n",
"# Comma-separated list of words to synthesize (single word here).\n",
"sound_files = \"ichthyosaur\".split(\",\")\n",
"# regex, replace `...` with ...\n",
"import re\n",
"# Strip the slashes and backticks that wrap IPA snippets in the draft.\n",
"text = text.replace(\"/\", \"\")\n",
"text = re.sub(r'`([^`]+)`', r'\\1', text)\n",
"print(text)\n",
"\n",
"# get sound files\n",
"for s in sound_files:\n",
"    w = s.strip()\n",
"    # Temporarily unbold the word so the rewrapping below matches plain text.\n",
"    text = text.replace(f\"*{w}*\", w)\n",
"    # get the audio file\n",
"\n",
"    import asyncio\n",
"    import edge_tts\n",
"    import pygame\n",
"    # One male and one female US voice per word.\n",
"    for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n",
"        # OUTPUT_FILE = f\"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3\"\n",
"        OUTPUT_FILE = f\"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n",
"        communicate = edge_tts.Communicate(w, VOICE)\n",
"        # Top-level await works inside Jupyter's running IPython event loop.\n",
"        await communicate.save(OUTPUT_FILE) \n",
"        print(f\"Audio files for {w} created!\") \n",
"\n",
"    entry_us = search_in_json_database(json_database, w, 'us')\n",
"    if entry_us == 'not exist':\n",
"        # No dictionary entry: the sentinel text itself gets injected.\n",
"        phonetics = entry_us\n",
"    else:\n",
"        phonetics = entry_us['pronunciation'] \n",
"    # NOTE(review): replaces EVERY bare occurrence of the word, and the\n",
"    # trailing '\\n' in wrapped_w is what caused the line break visible in\n",
"    # this cell's recorded output — confirm this is intended.\n",
"    wrapped_w = f'*{w}* {phonetics} \\n'\n",
"    text = text.replace(w, wrapped_w)\n",
"    # get phonetics for the word\n",
"\n",
"print(text)\n",
"\n",
"# send text to clipboard\n",
"import pyperclip\n",
"pyperclip.copy(text.replace(\"\\n\", \"\"))\n",
"print(\"Text copied to clipboard!\")"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting pyperclip\n",
" Downloading pyperclip-1.9.0.tar.gz (20 kB)\n",
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25hBuilding wheels for collected packages: pyperclip\n",
" Building wheel for pyperclip (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for pyperclip: filename=pyperclip-1.9.0-py3-none-any.whl size=11002 sha256=b07922d96d27e2cc0dce4f31dd18f85c90c5f4b9298ca359a6ad9a13494461d6\n",
" Stored in directory: /Users/joker/Library/Caches/pip/wheels/e0/e8/fc/8ab8aa326e33bc066ccd5f3ca9646eab4299881af933f94f09\n",
"Successfully built pyperclip\n",
"Installing collected packages: pyperclip\n",
"Successfully installed pyperclip-1.9.0\n"
]
}
],
"source": [
"# %pip (unlike !pip) is guaranteed to install into the running kernel's environment.\n",
"%pip install pyperclip"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1. **airplane** ˈer.pleɪn \n",
"2. **airport** ˈer.pɔːrt \n",
"3. **backyard** ˌbækˈjɑːrd \n",
"4. **bedroom** ˈbed.ruːm \n",
"5. **birthday** ˈbɝːθ.deɪ \n",
"6. **blackboard** ˈblæk.bɔːrd \n",
"7. **bookstore** ˈbʊk.stɔːr \n",
"8. **brainstorm** ˈbreɪn.stɔːrm \n",
"9. **breakfast** ˈbrek.fəst \n",
"10. **classroom** ˈklæs.ruːm \n",
"11. **cupcake** ˈkʌp.keɪk \n",
"12. **daydream** ˈdeɪ.driːm \n",
"13. **dishwasher** ˈdɪʃˌwɑː.ʃɚ \n",
"14. **doorbell** ˈdɔːr.bel \n",
"15. **downtown** ˌdaʊnˈtaʊn \n",
"16. **earthquake** ˈɝːθ.kweɪk \n",
"17. **everyday** ˈev.ri.deɪ \n",
"18. **eyewitness** ˈaɪˌwɪt.nəs \n",
"19. **firefighter** ˈfaɪrˌfaɪ.t̬ɚ \n",
"20. **football** ˈfʊt.bɑːl \n",
"21. **greenhouse** ˈɡriːn.haʊs \n",
"22. **handwriting** ˈhændˌraɪ.t̬ɪŋ \n",
"23. **headache** ˈhed.eɪk \n",
"24. **highway** ˈhaɪ.weɪ \n",
"25. **homework** ˈhoʊm.wɝːk \n",
"26. **iceberg** ˈaɪs.bɝːɡ \n",
"27. **jellyfish** ˈdʒel.i.fɪʃ \n",
"28. **laptop** ˈlæp.tɑːp \n",
"29. **lighthouse** ˈlaɪt.haʊs \n",
"30. **mailbox** ˈmeɪl.bɑːks \n",
"31. **moonlight** ˈmuːn.laɪt \n",
"32. **notebook** ˈnoʊt.bʊk \n",
"33. **nobody** ˈnoʊ.bɑː.di \n",
"34. **pancake** ˈpæn.keɪk \n",
"35. **postcard** ˈpoʊst.kɑːrd \n",
"36. **rainbow** ˈreɪn.boʊ \n",
"37. **sailboat** ˈseɪl.boʊt \n",
"38. **sandbox** ˈsænd.bɑːks \n",
"39. **seashore** ˈsiː.ʃɔːr \n",
"40. **skateboard** ˈskeɪt.bɔːrd \n",
"41. **snowflake** ˈsnoʊ.fleɪk \n",
"42. **spaceship** ˈspeɪs.ʃɪp \n",
"43. **sunflower** ˈsʌnˌflaʊ.ɚ \n",
"44. **sunshine** ˈsʌn.ʃaɪn \n",
"45. **superhero** ˈsuː.pɚˌhɪr.oʊ \n",
"46. **tablecloth** ˈteɪ.bəl.klɑːθ \n",
"47. **toothbrush** ˈtuːθ.brʌʃ \n",
"48. **toothpaste** ˈtuːθ.peɪst \n",
"49. **typewriter** ˈtaɪpˌraɪ.t̬ɚ \n",
"50. **underwater** ˌʌn.dɚˈwɑː.t̬ɚ \n",
"51. **upstairs** ʌpˈsterz \n",
"52. **volleyball** ˈvɑː.li.bɑːl \n",
"53. **waterfall** ˈwɑː.t̬ɚ.fɑːl \n",
"54. **watermelon** ˈwɑː.t̬ɚˌmel.ən \n",
"55. **weekend** ˈwiːk.end \n",
"56. **wheelchair** ˈwiːl.tʃer \n",
"57. **windmill** ˈwɪnd.mɪl \n",
"58. **workshop** ˈwɝːk.ʃɑːp \n"
]
}
],
"source": [
"# For each numbered '**word**' line below: synthesize two MP3s (male/female\n",
"# US voice) and print the line with its US IPA appended.\n",
"# Renamed from `list` to avoid shadowing the built-in `list` type.\n",
"word_list = \"\"\"\n",
"1. **airplane**\n",
"2. **airport**\n",
"3. **backyard**\n",
"4. **bedroom**\n",
"5. **birthday**\n",
"6. **blackboard**\n",
"7. **bookstore**\n",
"8. **brainstorm**\n",
"9. **breakfast**\n",
"10. **classroom**\n",
"11. **cupcake**\n",
"12. **daydream**\n",
"13. **dishwasher**\n",
"14. **doorbell**\n",
"15. **downtown**\n",
"16. **earthquake**\n",
"17. **everyday**\n",
"18. **eyewitness**\n",
"19. **firefighter**\n",
"20. **football**\n",
"21. **greenhouse**\n",
"22. **handwriting**\n",
"23. **headache**\n",
"24. **highway**\n",
"25. **homework**\n",
"26. **iceberg**\n",
"27. **jellyfish**\n",
"28. **laptop**\n",
"29. **lighthouse**\n",
"30. **mailbox**\n",
"31. **moonlight**\n",
"32. **notebook**\n",
"33. **nobody**\n",
"34. **pancake**\n",
"35. **postcard**\n",
"36. **rainbow**\n",
"37. **sailboat**\n",
"38. **sandbox**\n",
"39. **seashore**\n",
"40. **skateboard**\n",
"41. **snowflake**\n",
"42. **spaceship**\n",
"43. **sunflower**\n",
"44. **sunshine**\n",
"45. **superhero**\n",
"46. **tablecloth**\n",
"47. **toothbrush**\n",
"48. **toothpaste**\n",
"49. **typewriter**\n",
"50. **underwater**\n",
"51. **upstairs**\n",
"52. **volleyball**\n",
"53. **waterfall**\n",
"54. **watermelon**\n",
"55. **weekend**\n",
"56. **wheelchair**\n",
"57. **windmill**\n",
"58. **workshop**\n",
"\"\"\"\n",
"\n",
"# Imports hoisted out of the loop — they only need to run once.\n",
"import asyncio\n",
"import edge_tts\n",
"import pygame\n",
"\n",
"lines = word_list.split(\"\\n\")\n",
"for l in lines:\n",
"    if l.strip() == \"\":\n",
"        continue\n",
"    # Extract the word between ** and **; skip lines without bold markup\n",
"    # instead of crashing on .group() of a failed match.\n",
"    match = re.search(r'\\*\\*(.*)\\*\\*', l)\n",
"    if match is None:\n",
"        continue\n",
"    word = match.group(1)\n",
"\n",
"    # One MP3 per voice, e.g. 'airplane-us-male.mp3' / 'airplane-us-female.mp3'.\n",
"    for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n",
"        OUTPUT_FILE = f\"{word}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n",
"        communicate = edge_tts.Communicate(word, VOICE)\n",
"        # Top-level await works inside Jupyter's running IPython event loop.\n",
"        await communicate.save(OUTPUT_FILE)\n",
"    # print(f\"Audio files for {word} created!\") \n",
"\n",
"    # Append the US IPA (or the 'not exist' sentinel) to the printed line.\n",
"    entry_us = search_in_json_database(json_database, word, 'us')\n",
"    if entry_us == 'not exist':\n",
"        phonetics = entry_us\n",
"    else:\n",
"        phonetics = entry_us['pronunciation']\n",
"    wrapped_p = f' {phonetics} '\n",
"    l += wrapped_p\n",
"\n",
"    print(l)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}