Files
everyone-can-use-english/1000-hours/public/jupyter-notebooks/check-media.ipynb
2024-08-23 19:43:40 +08:00

222 lines
4.4 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"bear,\n",
"steer,\n",
"velour,\n",
"house,\n",
"English,\n",
"day,\n",
"reign,\n",
"English,\n",
"show,\n",
"eight,\n",
"buy,\n",
"reign,\n",
"dough,\n",
"play,\n",
"cheer,\n",
"pier,\n",
"grow,\n",
"break,\n",
"so,\n",
"June,\n",
"bear,\n",
"pair,\n",
"so,\n",
"toe,\n",
"cheer,\n",
"June,\n",
"supply,\n",
"cow,\n",
"out,\n",
"fierce,\n",
"moor,\n",
"align,\n",
"show,\n",
"join,\n",
"flower,\n",
"buy,\n",
"boat,\n",
"survey,\n",
"no,\n",
"cure,\n",
"survey,\n",
"boy,\n",
"fear,\n",
"make,\n",
"though,\n",
"brown,\n",
"velour,\n",
"moor,\n",
"grow,\n",
"near,\n",
"care,\n",
"eight,\n",
"pat,\n",
"blow,\n",
"play,\n",
"weight,\n",
"lies,\n",
"make,\n",
"fare,\n",
"spider,\n",
"pair,\n",
"pier,\n",
"though,\n",
"light,\n",
"out,\n",
"grey,\n",
"table,\n",
"supply,\n",
"steak,\n",
"fair,\n",
"vein,\n",
"fair,\n",
"paint,\n",
"cake,\n",
"blow,\n",
"they,\n",
"stay,\n",
"cure,\n",
"spider,\n",
"sew,\n",
"train,\n",
"great,\n",
"stay,\n",
"deer,\n",
"break,\n",
"guy,\n",
"Joe,\n",
"weight,\n",
"steer,\n",
"align,\n",
"dough,\n",
"boat,\n",
"toe,\n",
"kraut,\n",
"train,\n",
"great,\n",
"boy,\n",
"kraut,\n",
"deer,\n",
"ware,\n",
"rain,\n",
"grey,\n",
"tour,\n",
"toy,\n",
"near,\n",
"cow,\n",
"join,\n",
"lies,\n",
"table,\n",
"word,\n",
"toy,\n",
"rain,\n",
"clear,\n",
"ice,\n",
"ice,\n",
"fly,\n",
"fear,\n",
"fly,\n",
"serendipity,\n",
"care,\n",
"steak,\n",
"paint,\n",
"no,\n",
"vein,\n",
"tour,\n",
"clear,\n",
"soap,\n",
"pie,\n",
"cake,\n",
"brown,\n",
"sew,\n",
"fierce,\n",
"light,\n",
"fare,\n",
"pie,\n",
"pat,\n",
"Joe,\n",
"foe,\n",
"house,\n",
"tie,\n",
"word,\n",
"flower,\n",
"day,\n",
"they,\n",
"soap,\n",
"guy,\n",
"foe,\n",
"ware,\n",
"tie,\n",
"serendipity,\n"
]
}
],
"source": [
"import os\n",
"import re\n",
"\n",
"# Purpose: report audio clips that the markdown pages reference but that are\n",
"# missing from the audio directory.\n",
"md_path = \"../../sounds-of-american-english/\"\n",
"mp3_path = \"../audios/\"\n",
"\n",
"# Markdown files found directly in md_path.\n",
"md_files = [f for f in os.listdir(md_path) if f.endswith('.md')]\n",
"# MP3 files found directly in mp3_path; the set makes each lookup below O(1).\n",
"mp3_files = [f for f in os.listdir(mp3_path) if f.endswith('.mp3')]\n",
"mp3_names = set(mp3_files)\n",
"\n",
"# Captures the text between 'audios/' and '.mp3'. The dot is escaped so that only a\n",
"# literal '.mp3' ends a match; an unescaped '.' would accept any character there.\n",
"audio_ref = re.compile(r'audios/(.*?)\\.mp3')\n",
"\n",
"referenced = set()\n",
"for md_file in md_files:\n",
"    # Explicit encoding so the result does not depend on the platform default.\n",
"    # NOTE(review): assumes the markdown files are UTF-8 - confirm.\n",
"    with open(os.path.join(md_path, md_file), 'r', encoding='utf-8') as f:\n",
"        for line in f:\n",
"            referenced.update(audio_ref.findall(line))\n",
"\n",
"# De-duplicated and sorted, so the report order is stable from run to run.\n",
"audios_in_md = sorted(referenced)\n",
"\n",
"# For every referenced clip with no matching file, print the part of its name\n",
"# that precedes the first '-', followed by a comma.\n",
"for audio in audios_in_md:\n",
"    if f'{audio}.mp3' not in mp3_names:\n",
"        print(f'{audio.split(\"-\")[0].strip()},')\n",
"\n",
"# Optional reverse check (disabled): MP3 files that no markdown file references.\n",
"# for mp3 in mp3_files:\n",
"#     if mp3.replace('.mp3', '') not in audios_in_md:\n",
"#         print(mp3)\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}