80 lines
3.3 KiB
Plaintext
80 lines
3.3 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"name": "stdout",
|
|
"output_type": "stream",
|
|
"text": [
|
|
"1980\n"
|
|
]
|
|
},
|
|
{
|
|
"ename": "TypeError",
|
|
"evalue": "can only concatenate list (not \"str\") to list",
|
|
"output_type": "error",
|
|
"traceback": [
|
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|
"\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)",
|
|
"Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m lines:\n\u001b[1;32m 17\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudios/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m line:\n\u001b[0;32m---> 18\u001b[0m audios_in_md \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfindall\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maudios/(.*?).mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mline\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(audios_in_md))\n\u001b[1;32m 20\u001b[0m \u001b[38;5;66;03m# remove duplicates\u001b[39;00m\n",
|
|
"\u001b[0;31mTypeError\u001b[0m: can only concatenate list (not \"str\") to list"
|
|
]
|
|
}
|
|
],
|
|
"source": [
|
|
"import os\n",
"import re\n",
"\n",
"md_path = \"../../sounds-of-american-english/\"\n",
"mp3_path = \"../audios/\"\n",
"\n",
"# Markdown files found directly in md_path.\n",
"md_files = [f for f in os.listdir(md_path) if f.endswith('.md')]\n",
"# MP3 files found directly in mp3_path.\n",
"mp3_files = [f for f in os.listdir(mp3_path) if f.endswith('.mp3')]\n",
"print(len(mp3_files))\n",
"\n",
"# Collect every stem found between \"audios/\" and \".mp3\" in the markdown.\n",
"# re.findall returns a list of the captured stems, so it is extended onto\n",
"# audios_in_md directly; the '.mp3' suffix is added once, in the check below.\n",
"# The dot is escaped so it matches a literal '.' rather than any character.\n",
"audios_in_md = []\n",
"for md_file in md_files:\n",
"    # Read as UTF-8 explicitly instead of relying on the platform default.\n",
"    with open(os.path.join(md_path, md_file), 'r', encoding='utf-8') as f:\n",
"        for line in f:\n",
"            if \"audios/\" in line:\n",
"                audios_in_md += re.findall(r'audios/(.*?)\\.mp3', line)\n",
"print(len(audios_in_md))\n",
"\n",
"# Remove duplicates; sorted so the report below has a stable order.\n",
"audios_in_md = sorted(set(audios_in_md))\n",
"\n",
"# Report every referenced clip that has no matching file in mp3_path.\n",
"mp3_names = set(mp3_files)\n",
"for audio in audios_in_md:\n",
"    if f'{audio}.mp3' not in mp3_names:\n",
"        print(f'{audio}.mp3')\n"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "base",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.2"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 2
|
|
}
|