{ "cells": [ { "cell_type": "markdown", "source": [ "# 爬取定投课堂资料\n", "通过解析课程json文件,获取对应资源的url从而下载所有资料\n", "获取课程id" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 2, "metadata": { "collapsed": true, "ExecuteTime": { "end_time": "2023-07-05T10:56:39.054587Z", "start_time": "2023-07-05T10:56:34.185665600Z" } }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\75254\\PycharmProjects\\dt_audio_down\\venv\\Lib\\site-packages\\pydub\\utils.py:198: RuntimeWarning: Couldn't find ffprobe or avprobe - defaulting to ffprobe, but may not work\n", " warn(\"Couldn't find ffprobe or avprobe - defaulting to ffprobe, but may not work\", RuntimeWarning)\n" ] }, { "ename": "FileNotFoundError", "evalue": "[WinError 2] 系统找不到指定的文件。", "output_type": "error", "traceback": [ "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m", "\u001B[1;31mFileNotFoundError\u001B[0m Traceback (most recent call last)", "Cell \u001B[1;32mIn[2], line 74\u001B[0m\n\u001B[0;32m 72\u001B[0m \u001B[38;5;28;01mwith\u001B[39;00m \u001B[38;5;28mopen\u001B[39m(\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mtemp.mp3\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mwb\u001B[39m\u001B[38;5;124m'\u001B[39m) \u001B[38;5;28;01mas\u001B[39;00m file:\n\u001B[0;32m 73\u001B[0m file\u001B[38;5;241m.\u001B[39mwrite(i)\n\u001B[1;32m---> 74\u001B[0m audio_part \u001B[38;5;241m=\u001B[39m \u001B[43mAudioSegment\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfrom_mp3\u001B[49m\u001B[43m(\u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mtemp.mp3\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m)\u001B[49m\n\u001B[0;32m 75\u001B[0m \u001B[38;5;66;03m# audio_part = AudioSegment.from_mp3(i)\u001B[39;00m\n\u001B[0;32m 76\u001B[0m audio_seg_list\u001B[38;5;241m.\u001B[39mappend(audio_part)\n", "File \u001B[1;32m~\\PycharmProjects\\dt_audio_down\\venv\\Lib\\site-packages\\pydub\\audio_segment.py:796\u001B[0m, in \u001B[0;36mAudioSegment.from_mp3\u001B[1;34m(cls, file, parameters)\u001B[0m\n\u001B[0;32m 794\u001B[0m \u001B[38;5;129m@classmethod\u001B[39m\n\u001B[0;32m 795\u001B[0m \u001B[38;5;28;01mdef\u001B[39;00m \u001B[38;5;21mfrom_mp3\u001B[39m(\u001B[38;5;28mcls\u001B[39m, file, parameters\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mNone\u001B[39;00m):\n\u001B[1;32m--> 796\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28;43mcls\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mfrom_file\u001B[49m\u001B[43m(\u001B[49m\u001B[43mfile\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[38;5;124;43mmp3\u001B[39;49m\u001B[38;5;124;43m'\u001B[39;49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mparameters\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mparameters\u001B[49m\u001B[43m)\u001B[49m\n", "File \u001B[1;32m~\\PycharmProjects\\dt_audio_down\\venv\\Lib\\site-packages\\pydub\\audio_segment.py:728\u001B[0m, in \u001B[0;36mAudioSegment.from_file\u001B[1;34m(cls, file, format, codec, parameters, start_second, duration, **kwargs)\u001B[0m\n\u001B[0;32m 726\u001B[0m info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[0;32m 727\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m--> 728\u001B[0m info \u001B[38;5;241m=\u001B[39m \u001B[43mmediainfo_json\u001B[49m\u001B[43m(\u001B[49m\u001B[43morig_file\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mread_ahead_limit\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mread_ahead_limit\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 729\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m info:\n\u001B[0;32m 730\u001B[0m audio_streams \u001B[38;5;241m=\u001B[39m [x \u001B[38;5;28;01mfor\u001B[39;00m x \u001B[38;5;129;01min\u001B[39;00m info[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mstreams\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m 731\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m x[\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mcodec_type\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m==\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124maudio\u001B[39m\u001B[38;5;124m'\u001B[39m]\n", "File \u001B[1;32m~\\PycharmProjects\\dt_audio_down\\venv\\Lib\\site-packages\\pydub\\utils.py:274\u001B[0m, in \u001B[0;36mmediainfo_json\u001B[1;34m(filepath, read_ahead_limit)\u001B[0m\n\u001B[0;32m 271\u001B[0m file\u001B[38;5;241m.\u001B[39mclose()\n\u001B[0;32m 273\u001B[0m command \u001B[38;5;241m=\u001B[39m [prober, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124m-of\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mjson\u001B[39m\u001B[38;5;124m'\u001B[39m] \u001B[38;5;241m+\u001B[39m command_args\n\u001B[1;32m--> 274\u001B[0m res \u001B[38;5;241m=\u001B[39m \u001B[43mPopen\u001B[49m\u001B[43m(\u001B[49m\u001B[43mcommand\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstdin\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mstdin_parameter\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstdout\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mPIPE\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mstderr\u001B[49m\u001B[38;5;241;43m=\u001B[39;49m\u001B[43mPIPE\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 275\u001B[0m output, stderr \u001B[38;5;241m=\u001B[39m res\u001B[38;5;241m.\u001B[39mcommunicate(\u001B[38;5;28minput\u001B[39m\u001B[38;5;241m=\u001B[39mstdin_data)\n\u001B[0;32m 276\u001B[0m output \u001B[38;5;241m=\u001B[39m output\u001B[38;5;241m.\u001B[39mdecode(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mutf-8\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mignore\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", "File \u001B[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.1264.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py:1026\u001B[0m, in \u001B[0;36mPopen.__init__\u001B[1;34m(self, args, bufsize, executable, stdin, stdout, stderr, preexec_fn, close_fds, shell, cwd, env, universal_newlines, startupinfo, creationflags, restore_signals, start_new_session, pass_fds, user, group, extra_groups, encoding, errors, text, umask, pipesize, process_group)\u001B[0m\n\u001B[0;32m 1022\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtext_mode:\n\u001B[0;32m 1023\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstderr \u001B[38;5;241m=\u001B[39m io\u001B[38;5;241m.\u001B[39mTextIOWrapper(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstderr,\n\u001B[0;32m 1024\u001B[0m encoding\u001B[38;5;241m=\u001B[39mencoding, errors\u001B[38;5;241m=\u001B[39merrors)\n\u001B[1;32m-> 1026\u001B[0m \u001B[38;5;28;43mself\u001B[39;49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43m_execute_child\u001B[49m\u001B[43m(\u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mexecutable\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mpreexec_fn\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mclose_fds\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1027\u001B[0m \u001B[43m \u001B[49m\u001B[43mpass_fds\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcwd\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43menv\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1028\u001B[0m \u001B[43m \u001B[49m\u001B[43mstartupinfo\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mcreationflags\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mshell\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1029\u001B[0m \u001B[43m \u001B[49m\u001B[43mp2cread\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mp2cwrite\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1030\u001B[0m \u001B[43m \u001B[49m\u001B[43mc2pread\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mc2pwrite\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1031\u001B[0m \u001B[43m \u001B[49m\u001B[43merrread\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43merrwrite\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1032\u001B[0m \u001B[43m \u001B[49m\u001B[43mrestore_signals\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1033\u001B[0m \u001B[43m \u001B[49m\u001B[43mgid\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mgids\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43muid\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mumask\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1034\u001B[0m \u001B[43m \u001B[49m\u001B[43mstart_new_session\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43mprocess_group\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1035\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m:\n\u001B[0;32m 1036\u001B[0m \u001B[38;5;66;03m# Cleanup if the child failed starting.\u001B[39;00m\n\u001B[0;32m 1037\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m f \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mfilter\u001B[39m(\u001B[38;5;28;01mNone\u001B[39;00m, (\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstdin, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstdout, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mstderr)):\n", "File \u001B[1;32mC:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.11_3.11.1264.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py:1538\u001B[0m, in \u001B[0;36mPopen._execute_child\u001B[1;34m(self, args, executable, preexec_fn, close_fds, pass_fds, cwd, env, startupinfo, creationflags, shell, p2cread, p2cwrite, c2pread, c2pwrite, errread, errwrite, unused_restore_signals, unused_gid, unused_gids, unused_uid, unused_umask, unused_start_new_session, unused_process_group)\u001B[0m\n\u001B[0;32m 1536\u001B[0m \u001B[38;5;66;03m# Start the process\u001B[39;00m\n\u001B[0;32m 1537\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[1;32m-> 1538\u001B[0m hp, ht, pid, tid \u001B[38;5;241m=\u001B[39m \u001B[43m_winapi\u001B[49m\u001B[38;5;241;43m.\u001B[39;49m\u001B[43mCreateProcess\u001B[49m\u001B[43m(\u001B[49m\u001B[43mexecutable\u001B[49m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[43margs\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1539\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;66;43;03m# no special security\u001B[39;49;00m\n\u001B[0;32m 1540\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m,\u001B[49m\u001B[43m \u001B[49m\u001B[38;5;28;43;01mNone\u001B[39;49;00m\u001B[43m,\u001B[49m\n\u001B[0;32m 1541\u001B[0m \u001B[43m \u001B[49m\u001B[38;5;28;43mint\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;129;43;01mnot\u001B[39;49;00m\u001B[43m \u001B[49m\u001B[43mclose_fds\u001B[49m\u001B[43m)\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1542\u001B[0m \u001B[43m \u001B[49m\u001B[43mcreationflags\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1543\u001B[0m \u001B[43m \u001B[49m\u001B[43menv\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1544\u001B[0m \u001B[43m \u001B[49m\u001B[43mcwd\u001B[49m\u001B[43m,\u001B[49m\n\u001B[0;32m 1545\u001B[0m \u001B[43m \u001B[49m\u001B[43mstartupinfo\u001B[49m\u001B[43m)\u001B[49m\n\u001B[0;32m 1546\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[0;32m 1547\u001B[0m \u001B[38;5;66;03m# Child is launched. Close the parent's copy of those pipe\u001B[39;00m\n\u001B[0;32m 1548\u001B[0m \u001B[38;5;66;03m# handles that only the child should have open. You need\u001B[39;00m\n\u001B[1;32m (...)\u001B[0m\n\u001B[0;32m 1551\u001B[0m \u001B[38;5;66;03m# pipe will not close when the child process exits and the\u001B[39;00m\n\u001B[0;32m 1552\u001B[0m \u001B[38;5;66;03m# ReadFile will hang.\u001B[39;00m\n\u001B[0;32m 1553\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_close_pipe_fds(p2cread, p2cwrite,\n\u001B[0;32m 1554\u001B[0m c2pread, c2pwrite,\n\u001B[0;32m 1555\u001B[0m errread, errwrite)\n", "\u001B[1;31mFileNotFoundError\u001B[0m: [WinError 2] 系统找不到指定的文件。" ] } ], "source": [ "import json\n", "import time\n", "\n", "import requests\n", "from IPython.display import Audio,display\n", "from pydub import AudioSegment\n", "import os\n", "import random\n", "\n", "with open('audio.json', encoding=\"UTF-8\") as f:\n", " audios = json.load(f)\n", "# print(type(audios[1])) # Output: dict\n", "# TODO: 优化代码\n", "\n", "course_id = str(audios[1]['course_id'])\n", "os.makedirs(course_id, exist_ok=True)\n", "\n", "# 遍历\n", "audio_list = []\n", "for audio in audios:\n", " # t = random.randint(1, 10)\n", " # time.sleep(1)\n", " category = audio['category']\n", " # requests.adapters.DEFAULT_RETRIES = 100\n", " # 获取音频内容\n", " i = 1\n", " while(i >= 1):\n", " try:\n", " if category == \"PLAIN_AUDIO\":\n", " url = audio['attachment']['url']\n", " # print(url)\n", " audio = requests.get(url)\n", " # print(audio.status_code)\n", " # display(Audio(audio.content))\n", " audio_list.append(audio.content)\n", " # print(audio['attachment']['url'])\n", " # filename = os.path.basename(url)\n", " # with open(os.path.join(course_id, filename), 'wb') as file:\n", " # file.write(audio.content)\n", " # 获取文本笔记\n", " elif category == \"PLAIN_TEXT\":\n", " text = audio['text']\n", " print(text)\n", " with open(os.path.join(course_id, course_id + 'note.txt'), 'a') as file:\n", " file.write(text)\n", " file.write(\"\\n\")\n", " # 获取其他可下载附件\n", " elif category != \"MESSAGE_RECALL\":\n", " # print(audio['category'])\n", " if 'attachment' in audio:\n", " url = audio['attachment']['url']\n", " # print(audio['attachment']['url'])\n", " attachment = requests.get(url)\n", " filename = os.path.basename(url)\n", " with open(os.path.join(course_id, filename), 'wb') as file:\n", " file.write(attachment.content)\n", " # 获取其他内容\n", " else:\n", " print(audio['category'])\n", " if 'attachment' in audio:\n", " print(audio['attachment']['url'])\n", " i = 0\n", " except:\n", " i += 1\n", " print(\"get file failed\")\n", " if 'attachment' in audio:\n", " print(audio['attachment']['url'])\n", "\n", "# 处理获取所有音频文件\n", "audio_seg_list = []\n", "for i in audio_list:\n", " with open('temp.mp3', 'wb') as file:\n", " file.write(i)\n", " audio_part = AudioSegment.from_mp3('temp.mp3')\n", " # audio_part = AudioSegment.from_mp3(i)\n", " audio_seg_list.append(audio_part)\n", "\n", "# print(len(audio_seg_list))\n", "x = sum(audio_seg_list)\n", "\n", "# 递归求和\n", "# def sumOfList(list, size):\n", "# if (size == 0):\n", "# return 0\n", "# else:\n", "# return list[size - 1] + sumOfList(list, size - 1)\n", "#\n", "# total = sumOfList(list1, len(list1))\n", "\n", "audio_name = course_id + '/' + course_id + '.mp3'\n", "\n", "x.export(audio_name, format=\"mp3\")\n", "\n", "display(Audio(audio_name))" ] }, { "cell_type": "markdown", "source": [ "## 获取图片url" ], "metadata": { "collapsed": false } }, { "cell_type": "code", "execution_count": 1, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "C:\\Users\\75254\\PycharmProjects\\dt_audio_down\\venv\\Lib\\site-packages\\pydub\\utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\n", " warn(\"Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work\", RuntimeWarning)\n" ] } ], "source": [ "# get all url and text\n", "import json\n", "import requests\n", "from IPython.display import Audio,display\n", "from pydub import AudioSegment\n", "import os\n", "\n", "# get pic url list\n", "\n", "\n", "with open('audio.json', encoding= \"UTF-8\") as f:\n", " audios = json.load(f)\n", "# print(type(audios[1])) # Output: dict\n", "\n", "course_id = str(audios[1]['course_id'])\n", "# os.makedirs(course_id, exist_ok=True)\n", "\n", "audio_list = []\n", "for audio in audios:\n", " category = audio['category']\n", " if category == \"PLAIN_IMAGE\":\n", " url = audio['attachment']['url']\n", " print(url)\n", " # with open( course_id + '_pic_url.txt', 'a') as file:\n", " # file.write(url)\n", " # file.write(\"\\n\")" ], "metadata": { "collapsed": false, "ExecuteTime": { "end_time": "2023-07-05T10:56:34.184660400Z", "start_time": "2023-07-05T10:56:34.073384900Z" } } }, { "cell_type": "code", "execution_count": null, "outputs": [], "source": [], "metadata": { "collapsed": false } } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 2 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython2", "version": "2.7.6" } }, "nbformat": 4, "nbformat_minor": 0 }