Files
everyone-can-use-english/1000-hours/public/jupyter-notebooks/phonetics.ipynb
2024-08-22 17:36:27 +08:00

128 lines
4.7 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The history saving thread hit an unexpected error (DatabaseError('database disk image is malformed')).History will not be written to the database.\n"
]
}
],
"source": [
"import json\n",
"import vlc\n",
"import re\n",
"\n",
"# 假设你的 JSON 数据库是一个 JSON 文件,我们将从文件中加载数据\n",
"# 如果 JSON 数据在内存中或其他格式,你可能需要修改这部分代码\n",
"def load_json_database(file_path):\n",
" records = []\n",
" with open(file_path, 'r') as file:\n",
" for line in file:\n",
" try:\n",
" record = json.loads(line)\n",
" records.append(record)\n",
" except json.JSONDecodeError as e:\n",
" print(f\"Error parsing JSON: {e}\")\n",
" return records\n",
"\n",
"# The rest of the code remains the same...\n",
"\n",
"# 在 JSON 数据库中检索 word\n",
"def search_in_json_database(database, search_word, region):\n",
" for record in database:\n",
" # 检查 word 字段是否匹配\n",
" if record.get('word') == search_word:\n",
" # 找到匹配项后,获取美式发音信息\n",
" pos_items = record.get('pos_items', [])\n",
" for pos_item in pos_items:\n",
" pronunciations = pos_item.get('pronunciations', [])\n",
" for pronunciation in pronunciations:\n",
" if pronunciation.get('region') == region:\n",
" # 找到美式发音,返回相关信息\n",
" return {\n",
" 'pronunciation': pronunciation.get('pronunciation'),\n",
" 'audio': pronunciation.get('audio')\n",
" }\n",
" # 如果没有找到匹配的 word 字段,返回 'not exist'\n",
" return 'not exist'\n",
"\n",
"def search_pronunciation(database, pattern):\n",
" # Compile the regex pattern\n",
" regex = re.compile(pattern)\n",
" results = []\n",
" # Search in the database\n",
" for record in database:\n",
" for pos_item in record[\"pos_items\"]:\n",
" for pronunciation in pos_item[\"pronunciations\"]:\n",
" if regex.search(pronunciation[\"pronunciation\"]):\n",
" if pronunciation.get('region') == 'us':\n",
" results.append((record[\"word\"], pronunciation[\"pronunciation\"].replace(\".\", \"\"))) # record[\"_id\"][\"$oid\"],\n",
" # Return None if no match found\n",
" return results\n",
"\n",
"# 用于测试的 JSON 数据库文件路径\n",
"json_db_file_path = '/Users/joker/github/camdict/cam_dict.refined.json'\n",
"\n",
"json_database = load_json_database(json_db_file_path)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"*balls* <span class=\"pho alt\">bɑːlz</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/ballss-us-male.mp3\" data-audio-us-female=\"/audios/ballss-us-female.mp3\"></span>\n",
"*cards* <span class=\"pho alt\">kɑrdz</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/cardss-us-male.mp3\" data-audio-us-female=\"/audios/cardss-us-female.mp3\"></span>\n"
]
}
],
"source": [
"list = \"\"\"\n",
"balls,cards\n",
"\"\"\"\n",
"\n",
"for word in list.split(\",\"):\n",
" word = word.strip().lower()\n",
" result = search_in_json_database(json_database, word, 'us')\n",
" if result != 'not exist':\n",
" pho = result['pronunciation']\n",
" else:\n",
" pho = 'not exist'\n",
" line = f'*{word}* <span class=\"pho alt\">{pho}</span><span class=\"speak-word-inline\" data-audio-us-male=\"/audios/{word}s-us-male.mp3\" data-audio-us-female=\"/audios/{word}s-us-female.mp3\"></span>'\n",
" print(line)\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}