everyone-can-use-english/1000-hours/public/jupyter-notebooks/check-media.ipynb

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "bear,\n",
      "steer,\n",
      "velour,\n",
      "house,\n",
      "English,\n",
      "day,\n",
      "reign,\n",
      "English,\n",
      "show,\n",
      "eight,\n",
      "buy,\n",
      "reign,\n",
      "dough,\n",
      "play,\n",
      "cheer,\n",
      "pier,\n",
      "grow,\n",
      "break,\n",
      "so,\n",
      "June,\n",
      "bear,\n",
      "pair,\n",
      "so,\n",
      "toe,\n",
      "cheer,\n",
      "June,\n",
      "supply,\n",
      "cow,\n",
      "out,\n",
      "fierce,\n",
      "moor,\n",
      "align,\n",
      "show,\n",
      "join,\n",
      "flower,\n",
      "buy,\n",
      "boat,\n",
      "survey,\n",
      "no,\n",
      "cure,\n",
      "survey,\n",
      "boy,\n",
      "fear,\n",
      "make,\n",
      "though,\n",
      "brown,\n",
      "velour,\n",
      "moor,\n",
      "grow,\n",
      "near,\n",
      "care,\n",
      "eight,\n",
      "pat,\n",
      "blow,\n",
      "play,\n",
      "weight,\n",
      "lies,\n",
      "make,\n",
      "fare,\n",
      "spider,\n",
      "pair,\n",
      "pier,\n",
      "though,\n",
      "light,\n",
      "out,\n",
      "grey,\n",
      "table,\n",
      "supply,\n",
      "steak,\n",
      "fair,\n",
      "vein,\n",
      "fair,\n",
      "paint,\n",
      "cake,\n",
      "blow,\n",
      "they,\n",
      "stay,\n",
      "cure,\n",
      "spider,\n",
      "sew,\n",
      "train,\n",
      "great,\n",
      "stay,\n",
      "deer,\n",
      "break,\n",
      "guy,\n",
      "Joe,\n",
      "weight,\n",
      "steer,\n",
      "align,\n",
      "dough,\n",
      "boat,\n",
      "toe,\n",
      "kraut,\n",
      "train,\n",
      "great,\n",
      "boy,\n",
      "kraut,\n",
      "deer,\n",
      "ware,\n",
      "rain,\n",
      "grey,\n",
      "tour,\n",
      "toy,\n",
      "near,\n",
      "cow,\n",
      "join,\n",
      "lies,\n",
      "table,\n",
      "word,\n",
      "toy,\n",
      "rain,\n",
      "clear,\n",
      "ice,\n",
      "ice,\n",
      "fly,\n",
      "fear,\n",
      "fly,\n",
      "serendipity,\n",
      "care,\n",
      "steak,\n",
      "paint,\n",
      "no,\n",
      "vein,\n",
      "tour,\n",
      "clear,\n",
      "soap,\n",
      "pie,\n",
      "cake,\n",
      "brown,\n",
      "sew,\n",
      "fierce,\n",
      "light,\n",
      "fare,\n",
      "pie,\n",
      "pat,\n",
      "Joe,\n",
      "foe,\n",
      "house,\n",
      "tie,\n",
      "word,\n",
      "flower,\n",
      "day,\n",
      "they,\n",
      "soap,\n",
      "guy,\n",
      "foe,\n",
      "ware,\n",
      "tie,\n",
      "serendipity,\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import re\n",
    "\n",
    "# Directory holding the markdown pages, and directory holding the audio clips.\n",
    "md_path = \"../../sounds-of-american-english/\"\n",
    "mp3_path = \"../audios/\"\n",
    "\n",
    "# Markdown files found directly in md_path.\n",
    "md_files = [f for f in os.listdir(md_path) if f.endswith('.md')]\n",
    "# mp3 files found directly in mp3_path.\n",
    "mp3_files = [f for f in os.listdir(mp3_path) if f.endswith('.mp3')]\n",
    "# Set copy of the file names, so the membership test below is a hash lookup.\n",
    "mp3_names = set(mp3_files)\n",
    "\n",
    "# Captures the text between 'audios/' and '.mp3'. The dot is escaped so that\n",
    "# only a literal '.mp3' ends the match (an unescaped dot matches any character).\n",
    "audio_ref = re.compile(r'audios/(.*?)\\.mp3')\n",
    "\n",
    "# Collect every clip name referenced by any markdown file; a set drops duplicates.\n",
    "referenced = set()\n",
    "for md_file in md_files:\n",
    "    # Explicit encoding so the result does not depend on the platform default.\n",
    "    # NOTE(review): assumes the markdown files are UTF-8 - confirm.\n",
    "    with open(os.path.join(md_path, md_file), 'r', encoding='utf-8') as f:\n",
    "        for line in f:\n",
    "            referenced.update(audio_ref.findall(line))\n",
    "\n",
    "# Sorted so that repeated runs print in the same order (set order is arbitrary).\n",
    "audios_in_md = sorted(referenced)\n",
    "\n",
    "# For every referenced clip with no matching file in mp3_path, print the part of\n",
    "# its name before the first '-', followed by a comma. One line is printed per\n",
    "# missing clip, so the same prefix can appear more than once.\n",
    "for audio in audios_in_md:\n",
    "    if f'{audio}.mp3' not in mp3_names:\n",
    "        print(f'{audio.split(\"-\")[0].strip()},')\n",
    "\n",
    "# Reverse check (disabled): list mp3 files that no markdown file references.\n",
    "# for mp3 in mp3_files:\n",
    "#     if mp3.replace('.mp3', '') not in audios_in_md:\n",
    "#         print(mp3)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}