{ "cells": [ { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import sys\n", "import requests\n", "import json\n", "import vlc\n", "import re\n", "import random\n", "from IPython.display import Audio\n", "\n", "def load_json_database(url):\n", " records = []\n", " try:\n", " response = requests.get(url)\n", " response.raise_for_status() # Raise an error for bad status codes\n", " for line in response.iter_lines(decode_unicode=True):\n", " if line:\n", " try:\n", " record = json.loads(line)\n", " records.append(record)\n", " except json.JSONDecodeError as e:\n", " print(f\"Error parsing JSON: {e}\")\n", " except requests.exceptions.RequestException as e:\n", " print(f\"Error fetching data from URL: {e}\")\n", " return records\n", "\n", "def search_in_json_database(database, search_word, region):\n", " for record in database:\n", " # 检查 word 字段是否匹配\n", " if record.get('word') == search_word:\n", " # 找到匹配项后,获取美式发音信息\n", " pos_items = record.get('pos_items', [])\n", " for pos_item in pos_items:\n", " pronunciations = pos_item.get('pronunciations', [])\n", " for pronunciation in pronunciations:\n", " if pronunciation.get('region') == region:\n", " # 找到美式发音,返回相关信息\n", " return {\n", " 'pronunciation': pronunciation.get('pronunciation'),\n", " 'audio': pronunciation.get('audio')\n", " }\n", " # 如果没有找到匹配的 word 字段,返回 'not exist'\n", " return 'not exist'\n", "\n", "url = \"https://raw.githubusercontent.com/zelic91/camdict/main/cam_dict.refined.json\"\n", "\n", "json_database = load_json_database(url)\n" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "之前讲过,非重音音节里的元音可能会被弱化为 schwa ə…… 在自然语流中,连 schwa ə 都可能会被进一步弱化,变成非常轻的 ɤ。比如,常用词 reasonable,/ˈriːzənəbəl/,实际听到的常常是 /ˈriːzɤnəbəl/……\n", "\n", "phonetics_not_exist: \n", "\n", "\n" ] } ], "source": [ "text =\"\"\"\n", "之前讲过,非重音音节里的元音可能会被弱化为 schwa `ə`…… 在自然语流中,连 schwa `ə` 都可能会被进一步弱化,变成非常轻的 `ɤ`。比如,常用词 reasonable,`/ˈriːzənəbəl/`,实际听到的常常是` /ˈriːzɤnəbəl/`……\n", "\"\"\"\n", "\n", "words = \"\"\n", "phonetics_not_exist = ''\n", "lines = text.split(\"\\n\")\n", "for line in lines:\n", " line = line.replace(\" `\", \" \")\n", " line = line.replace(\"`\", \"\")\n", " if '*' in line:\n", " line = line.replace('\"', \"**\")\n", " examples = line.split(\"-\")[1].split(\",\")\n", " examples = [x.strip() for x in examples]\n", " line = line.replace(\" - \", \"\\n\")\n", " # print(examples)\n", " # wrap examples in span,\n", " for e in examples:\n", " # join e in words with ','\n", " words += e + \",\"\n", " entry_us = search_in_json_database(json_database, e, 'us')\n", " if entry_us == 'not exist':\n", " phonetics = entry_us\n", " phonetics_not_exist += f'{e},'\n", " else:\n", " phonetics = entry_us['pronunciation'] \n", " wrapped_e = f'\\t- {e} {phonetics} \\n'\n", " line = line.replace(e, wrapped_e).replace(',', '').strip()\n", " \n", "\n", " print(line)\n", "print(f'phonetics_not_exist: {phonetics_not_exist}')\n", "print('\\n'+words)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['reasonable']\n", "reasonable\n", "Files created!\n" ] } ], "source": [ "import asyncio\n", "import edge_tts\n", "import pygame\n", "text = words.rstrip(\",\")\n", "Wordlist = text.split(\",\")\n", "# Wordlist = ['reasonable']\n", "\n", "print(Wordlist)\n", "for w in Wordlist:\n", " # for VOICE in ['en-US-GuyNeural', 'en-US-JennyNeural', 'en-GB-RyanNeural', 'en-GB-SoniaNeural']:\n", " for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n", " w = w.strip()\n", " # OUTPUT_FILE = f\"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3\"\n", " OUTPUT_FILE = f\"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n", " communicate = edge_tts.Communicate(w, VOICE)\n", " await communicate.save(OUTPUT_FILE) \n", " print(w)\n", "print(\"Files created!\") " ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "举个例子,*ichthyosaur*,这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,ˈɪk.θi.ə.sɔːr —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,ˈɪk.θiə.sɔːr,*ich* ⭤ ˈɪk, *thyo* ⭤ θiə, *saur* ⭤ sɔːr…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n", "\n", "Audio files for ichthyosaur created!\n", "\n", "举个例子,*ichthyosaur* ˈɪk.θi.ə.sɔːr \n", ",这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,ˈɪk.θi.ə.sɔːr —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,ˈɪk.θiə.sɔːr,*ich* ⭤ ˈɪk, *thyo* ⭤ θiə, *saur* ⭤ sɔːr…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n", "\n", "Text copied to clipboard!\n" ] } ], "source": [ "text =\"\"\"\n", "举个例子,*ichthyosaur*,这个一看就知道并非常用的词汇,其实很简单,先从表音构成去看,`/ˈɪk.θi.ə.sɔːr/` —— 剑桥词典把它划分成了 4 个音节…… 但感觉上,第二第三个音节可以合并,`/ˈɪk.θiə.sɔːr/`,*ich* ⭤ `/ˈɪk/`, *thyo* ⭤ `/θiə/`, *saur* ⭤ `/sɔːr/`…… 而从表意的角度去看呢?前半部 *ichthyo-* 的意思是 “与鱼有关的”…… 后半部 *-saur* 是什么意思呢?各种恐龙的 “龙” 都是 -saur 结尾,于是,这个词的意思是 “鱼龙”…… 换言之,这个单词的两个部分,都是拉丁词根词缀,也都是 “既表音又表意” 的,事实上很简单 —— 虽然拼写乍看起来很复杂。\n", "\"\"\"\n", "\n", "sound_files = \"ichthyosaur\".split(\",\")\n", "# regex, replace `...` with ...\n", "import re\n", "text = text.replace(\"/\", \"\")\n", "text = re.sub(r'`([^`]+)`', r'\\1', text)\n", "print(text)\n", "\n", "# get sound files\n", "for s in sound_files:\n", " w = s.strip()\n", " text = text.replace(f\"*{w}*\", w)\n", " # get the audio file\n", "\n", " import asyncio\n", " import edge_tts\n", " import pygame\n", " for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n", " # OUTPUT_FILE = f\"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3\"\n", " OUTPUT_FILE = f\"{w}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n", " communicate = edge_tts.Communicate(w, VOICE)\n", " await communicate.save(OUTPUT_FILE) \n", " print(f\"Audio files for {w} created!\") \n", "\n", " entry_us = search_in_json_database(json_database, w, 'us')\n", " if entry_us == 'not exist':\n", " phonetics = entry_us\n", " else:\n", " phonetics = entry_us['pronunciation'] \n", " wrapped_w = f'*{w}* {phonetics} \\n'\n", " text = text.replace(w, wrapped_w)\n", " # get phonetics for the word\n", "\n", "print(text)\n", "\n", "# send text to clipboard\n", "import pyperclip\n", "pyperclip.copy(text.replace(\"\\n\", \"\"))\n", "print(\"Text copied to clipboard!\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting pyperclip\n", " Downloading pyperclip-1.9.0.tar.gz (20 kB)\n", " Preparing metadata (setup.py) ... \u001b[?25ldone\n", "\u001b[?25hBuilding wheels for collected packages: pyperclip\n", " Building wheel for pyperclip (setup.py) ... \u001b[?25ldone\n", "\u001b[?25h Created wheel for pyperclip: filename=pyperclip-1.9.0-py3-none-any.whl size=11002 sha256=b07922d96d27e2cc0dce4f31dd18f85c90c5f4b9298ca359a6ad9a13494461d6\n", " Stored in directory: /Users/joker/Library/Caches/pip/wheels/e0/e8/fc/8ab8aa326e33bc066ccd5f3ca9646eab4299881af933f94f09\n", "Successfully built pyperclip\n", "Installing collected packages: pyperclip\n", "Successfully installed pyperclip-1.9.0\n" ] } ], "source": [ "!pip install pyperclip" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1. **airplane** ˈer.pleɪn \n", "2. **airport** ˈer.pɔːrt \n", "3. **backyard** ˌbækˈjɑːrd \n", "4. **bedroom** ˈbed.ruːm \n", "5. **birthday** ˈbɝːθ.deɪ \n", "6. **blackboard** ˈblæk.bɔːrd \n", "7. **bookstore** ˈbʊk.stɔːr \n", "8. **brainstorm** ˈbreɪn.stɔːrm \n", "9. **breakfast** ˈbrek.fəst \n", "10. **classroom** ˈklæs.ruːm \n", "11. **cupcake** ˈkʌp.keɪk \n", "12. **daydream** ˈdeɪ.driːm \n", "13. **dishwasher** ˈdɪʃˌwɑː.ʃɚ \n", "14. **doorbell** ˈdɔːr.bel \n", "15. **downtown** ˌdaʊnˈtaʊn \n", "16. **earthquake** ˈɝːθ.kweɪk \n", "17. **everyday** ˈev.ri.deɪ \n", "18. **eyewitness** ˈaɪˌwɪt.nəs \n", "19. **firefighter** ˈfaɪrˌfaɪ.t̬ɚ \n", "20. **football** ˈfʊt.bɑːl \n", "21. **greenhouse** ˈɡriːn.haʊs \n", "22. **handwriting** ˈhændˌraɪ.t̬ɪŋ \n", "23. **headache** ˈhed.eɪk \n", "24. **highway** ˈhaɪ.weɪ \n", "25. **homework** ˈhoʊm.wɝːk \n", "26. **iceberg** ˈaɪs.bɝːɡ \n", "27. **jellyfish** ˈdʒel.i.fɪʃ \n", "28. **laptop** ˈlæp.tɑːp \n", "29. **lighthouse** ˈlaɪt.haʊs \n", "30. **mailbox** ˈmeɪl.bɑːks \n", "31. **moonlight** ˈmuːn.laɪt \n", "32. **notebook** ˈnoʊt.bʊk \n", "33. **nobody** ˈnoʊ.bɑː.di \n", "34. **pancake** ˈpæn.keɪk \n", "35. **postcard** ˈpoʊst.kɑːrd \n", "36. **rainbow** ˈreɪn.boʊ \n", "37. **sailboat** ˈseɪl.boʊt \n", "38. **sandbox** ˈsænd.bɑːks \n", "39. **seashore** ˈsiː.ʃɔːr \n", "40. **skateboard** ˈskeɪt.bɔːrd \n", "41. **snowflake** ˈsnoʊ.fleɪk \n", "42. **spaceship** ˈspeɪs.ʃɪp \n", "43. **sunflower** ˈsʌnˌflaʊ.ɚ \n", "44. **sunshine** ˈsʌn.ʃaɪn \n", "45. **superhero** ˈsuː.pɚˌhɪr.oʊ \n", "46. **tablecloth** ˈteɪ.bəl.klɑːθ \n", "47. **toothbrush** ˈtuːθ.brʌʃ \n", "48. **toothpaste** ˈtuːθ.peɪst \n", "49. **typewriter** ˈtaɪpˌraɪ.t̬ɚ \n", "50. **underwater** ˌʌn.dɚˈwɑː.t̬ɚ \n", "51. **upstairs** ʌpˈsterz \n", "52. **volleyball** ˈvɑː.li.bɑːl \n", "53. **waterfall** ˈwɑː.t̬ɚ.fɑːl \n", "54. **watermelon** ˈwɑː.t̬ɚˌmel.ən \n", "55. **weekend** ˈwiːk.end \n", "56. **wheelchair** ˈwiːl.tʃer \n", "57. **windmill** ˈwɪnd.mɪl \n", "58. **workshop** ˈwɝːk.ʃɑːp \n" ] } ], "source": [ "list = \"\"\"\n", "1. **airplane**\n", "2. **airport**\n", "3. **backyard**\n", "4. **bedroom**\n", "5. **birthday**\n", "6. **blackboard**\n", "7. **bookstore**\n", "8. **brainstorm**\n", "9. **breakfast**\n", "10. **classroom**\n", "11. **cupcake**\n", "12. **daydream**\n", "13. **dishwasher**\n", "14. **doorbell**\n", "15. **downtown**\n", "16. **earthquake**\n", "17. **everyday**\n", "18. **eyewitness**\n", "19. **firefighter**\n", "20. **football**\n", "21. **greenhouse**\n", "22. **handwriting**\n", "23. **headache**\n", "24. **highway**\n", "25. **homework**\n", "26. **iceberg**\n", "27. **jellyfish**\n", "28. **laptop**\n", "29. **lighthouse**\n", "30. **mailbox**\n", "31. **moonlight**\n", "32. **notebook**\n", "33. **nobody**\n", "34. **pancake**\n", "35. **postcard**\n", "36. **rainbow**\n", "37. **sailboat**\n", "38. **sandbox**\n", "39. **seashore**\n", "40. **skateboard**\n", "41. **snowflake**\n", "42. **spaceship**\n", "43. **sunflower**\n", "44. **sunshine**\n", "45. **superhero**\n", "46. **tablecloth**\n", "47. **toothbrush**\n", "48. **toothpaste**\n", "49. **typewriter**\n", "50. **underwater**\n", "51. **upstairs**\n", "52. **volleyball**\n", "53. **waterfall**\n", "54. **watermelon**\n", "55. **weekend**\n", "56. **wheelchair**\n", "57. **windmill**\n", "58. **workshop**\n", "\"\"\"\n", "\n", "lines = list.split(\"\\n\")\n", "for l in lines:\n", " if l.strip() == \"\":\n", " continue\n", " # extract str between ** and **\n", " word = re.search(r'\\*\\*(.*)\\*\\*', l).group(1)\n", "\n", " import asyncio\n", " import edge_tts\n", " import pygame\n", " for VOICE in ['en-US-GuyNeural', 'en-US-MichelleNeural']:\n", " # OUTPUT_FILE = f\"{w}-{VOICE.replace('EricNeural', 'Guy-Male').replace('JennyNeural', 'Jenny-Female').replace('RyanNeural', 'Ryan-Male').replace('SoniaNeural', 'Sonia-Female').lower()}.mp3\"\n", " OUTPUT_FILE = f\"{word}-{VOICE.replace('GuyNeural', 'Male').replace('MichelleNeural', 'Female').replace('en-', '').lower()}.mp3\"\n", " communicate = edge_tts.Communicate(word, VOICE)\n", " await communicate.save(OUTPUT_FILE) \n", " # print(f\"Audio files for {word} created!\") \n", "\n", " entry_us = search_in_json_database(json_database, word, 'us')\n", " if entry_us == 'not exist':\n", " phonetics = entry_us\n", " else:\n", " phonetics = entry_us['pronunciation'] \n", " wrapped_p = f' {phonetics} '\n", " l += wrapped_p\n", "\n", " print(l)" ] } ], "metadata": { "kernelspec": { "display_name": "base", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 2 }