Files
everyone-can-use-english/1000-hours/public/jupyter-notebooks/check-media.ipynb
2024-08-22 17:37:48 +08:00

80 lines
3.3 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1980\n"
]
},
{
"ename": "TypeError",
"evalue": "can only concatenate list (not \"str\") to list",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m lines:\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudios/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m line:\n\u001b[0;32m---> 18\u001b[0m audios_in_md \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfindall\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maudios/(.*?).mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mline\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(audios_in_md))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# remove duplicates\u001b[39;00m\n",
"\u001b[0;31mTypeError\u001b[0m: can only concatenate list (not \"str\") to list"
]
}
],
"source": [
"import os\n",
"md_path = \"../../sounds-of-american-english/\"\n",
"mp3_path = \"../audios/\"\n",
"# get md files in md_path\n",
"md_files = [f for f in os.listdir(md_path) if f.endswith('.md')]\n",
"# get all mp3 files in mp3_path\n",
"mp3_files = [f for f in os.listdir(mp3_path) if f.endswith('.mp3')]\n",
"print(len(mp3_files))\n",
"\n",
"# read md files, and get all sub-string between \"audios/\" and \".mp3\", using regex\n",
"import re\n",
"audios_in_md = []\n",
"for md_file in md_files:\n",
" with open(md_path + md_file, 'r') as f:\n",
" lines = f.readlines()\n",
" for line in lines:\n",
" if \"audios/\" in line:\n",
" audios_in_md += re.findall(r'audios/(.*?).mp3', line) + '.mp3'\n",
"print(len(audios_in_md))\n",
"# remove duplicates\n",
"audios_in_md = list(set(audios_in_md))\n",
"\n",
"for audio in audios_in_md:\n",
" if not f'{audio}.mp3' in mp3_files:\n",
" print(f'{audio}.mp3')\n",
"\n",
"\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.2"
}
},
"nbformat": 4,
"nbformat_minor": 2
}