everyone-can-use-english/1000-hours/public/jupyter-notebooks/check-media.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1980\n"
     ]
    },
    {
     "ename": "TypeError",
     "evalue": "can only concatenate list (not \"str\") to list",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[4], line 18\u001b[0m\n\u001b[1;32m     16\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m line \u001b[38;5;129;01min\u001b[39;00m lines:\n\u001b[1;32m     17\u001b[0m             \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124maudios/\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m line:\n\u001b[0;32m---> 18\u001b[0m                 audios_in_md \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[43mre\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfindall\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43mr\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43maudios/(.*?).mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mline\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m+\u001b[39;49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43m.mp3\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\n\u001b[1;32m     19\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mlen\u001b[39m(audios_in_md))\n\u001b[1;32m     20\u001b[0m \u001b[38;5;66;03m# remove duplicates\u001b[39;00m\n",
      "\u001b[0;31mTypeError\u001b[0m: can only concatenate list (not \"str\") to list"
     ]
    }
   ],
   "source": [
    "import os\n",
    "md_path = \"../../sounds-of-american-english/\"\n",
    "mp3_path = \"../audios/\"\n",
    "# get md files in md_path\n",
    "md_files = [f for f in os.listdir(md_path) if f.endswith('.md')]\n",
    "# get all mp3 files in mp3_path\n",
    "mp3_files = [f for f in os.listdir(mp3_path) if f.endswith('.mp3')]\n",
    "print(len(mp3_files))\n",
    "\n",
    "# read md files, and get all sub-string between \"audios/\" and \".mp3\", using regex\n",
    "import re\n",
    "audios_in_md = []\n",
    "for md_file in md_files:\n",
    "    with open(md_path + md_file, 'r') as f:\n",
    "        lines = f.readlines()\n",
    "        for line in lines:\n",
    "            if \"audios/\" in line:\n",
    "                audios_in_md += re.findall(r'audios/(.*?).mp3', line) + '.mp3'\n",
    "print(len(audios_in_md))\n",
    "# remove duplicates\n",
    "audios_in_md = list(set(audios_in_md))\n",
    "\n",
    "for audio in audios_in_md:\n",
    "    if not f'{audio}.mp3' in mp3_files:\n",
    "        print(f'{audio}.mp3')\n",
    "\n",
    "\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}