Files
dt_audio/json_parser.ipynb
Lostecho a0825003e7 update
2023-07-13 10:09:11 +08:00

237 lines
19 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
{
"cells": [
{
"cell_type": "markdown",
"source": [
"# 爬取定投课堂资料\n",
"\n",
"通过解析课程 JSON 文件获取对应资源的 URL，从而下载所有资料。\n",
"\n",
"获取课程 id。"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true,
"ExecuteTime": {
"end_time": "2023-07-05T10:56:39.054587Z",
"start_time": "2023-07-05T10:56:34.185665600Z"
}
},
"outputs": [],
"source": [
"import json\n",
"import time\n",
"\n",
"import requests\n",
"from IPython.display import Audio,display\n",
"from pydub import AudioSegment\n",
"import os\n",
"import random\n",
"\n",
"# pydub launches ffmpeg/ffprobe as external programs; both must be on PATH or\n",
"# the MP3 decoding/export steps at the bottom of this cell fail with FileNotFoundError.\n",
"\n",
"MAX_ATTEMPTS = 5       # download attempts per URL before giving up\n",
"REQUEST_TIMEOUT = 30   # seconds to wait on each HTTP request\n",
"RETRY_DELAY = 1        # seconds to pause between attempts\n",
"TEMP_MP3 = 'temp.mp3'  # scratch file handed to pydub for decoding\n",
"\n",
"\n",
"def fetch_bytes(url, attempts=MAX_ATTEMPTS, timeout=REQUEST_TIMEOUT):\n",
"    \"\"\"Download ``url`` and return the response body as bytes.\n",
"\n",
"    Only network/HTTP failures (``requests.RequestException``) are retried,\n",
"    at most ``attempts`` times. Returns ``None`` when every attempt fails so\n",
"    the caller can skip the resource instead of retrying forever.\n",
"    \"\"\"\n",
"    for attempt in range(1, attempts + 1):\n",
"        try:\n",
"            response = requests.get(url, timeout=timeout)\n",
"            # Treat HTTP 4xx/5xx as failures so an error page is never saved as media.\n",
"            response.raise_for_status()\n",
"            return response.content\n",
"        except requests.RequestException:\n",
"            print(\"get file failed\")\n",
"            print(url)\n",
"            if attempt < attempts:\n",
"                time.sleep(RETRY_DELAY)\n",
"    return None\n",
"\n",
"\n",
"with open('audio.json', encoding=\"UTF-8\") as f:\n",
"    audios = json.load(f)\n",
"\n",
"# NOTE(review): the course id is read from the second entry (index 1), exactly as\n",
"# before; confirm that every export contains at least two entries.\n",
"course_id = str(audios[1]['course_id'])\n",
"os.makedirs(course_id, exist_ok=True)\n",
"note_path = os.path.join(course_id, course_id + 'note.txt')\n",
"\n",
"# Walk every message once: collect audio bytes, append text notes and save\n",
"# any other downloadable attachment into the course directory.\n",
"audio_list = []\n",
"failed_urls = []\n",
"for entry in audios:\n",
"    category = entry['category']\n",
"\n",
"    if category == \"PLAIN_TEXT\":\n",
"        text = entry['text']\n",
"        print(text)\n",
"        # Explicit UTF-8 so the notes do not depend on the platform's default codec.\n",
"        # NOTE(review): append mode means re-running the cell duplicates the notes.\n",
"        with open(note_path, 'a', encoding=\"UTF-8\") as file:\n",
"            file.write(text)\n",
"            file.write(\"\\n\")\n",
"        continue\n",
"\n",
"    has_attachment = 'attachment' in entry\n",
"\n",
"    if category == \"MESSAGE_RECALL\":\n",
"        # Recalled messages are only reported, never downloaded.\n",
"        print(category)\n",
"        if has_attachment:\n",
"            print(entry['attachment']['url'])\n",
"        continue\n",
"\n",
"    if not has_attachment:\n",
"        if category == \"PLAIN_AUDIO\":\n",
"            print(\"audio entry without attachment, skipped\")\n",
"        continue\n",
"\n",
"    url = entry['attachment']['url']\n",
"    content = fetch_bytes(url)\n",
"    if content is None:\n",
"        failed_urls.append(url)\n",
"    elif category == \"PLAIN_AUDIO\":\n",
"        # Audio parts are kept in memory and merged into one file below.\n",
"        audio_list.append(content)\n",
"    else:\n",
"        filename = os.path.basename(url)\n",
"        with open(os.path.join(course_id, filename), 'wb') as file:\n",
"            file.write(content)\n",
"\n",
"if failed_urls:\n",
"    print(\"failed downloads:\")\n",
"    for url in failed_urls:\n",
"        print(url)\n",
"\n",
"# Decode each downloaded MP3 into a pydub segment, preserving message order.\n",
"audio_seg_list = []\n",
"try:\n",
"    for chunk in audio_list:\n",
"        with open(TEMP_MP3, 'wb') as file:\n",
"            file.write(chunk)\n",
"        audio_seg_list.append(AudioSegment.from_mp3(TEMP_MP3))\n",
"finally:\n",
"    # Do not leave the scratch file behind, even when decoding fails.\n",
"    if os.path.exists(TEMP_MP3):\n",
"        os.remove(TEMP_MP3)\n",
"\n",
"if audio_seg_list:\n",
"    # sum() concatenates the segments in order, as the original cell did.\n",
"    merged_audio = sum(audio_seg_list)\n",
"    audio_name = os.path.join(course_id, course_id + '.mp3')\n",
"    merged_audio.export(audio_name, format=\"mp3\")\n",
"    display(Audio(audio_name))\n",
"else:\n",
"    # sum([]) is the int 0, which has no export(); report instead of crashing.\n",
"    print(\"no audio downloaded, nothing to merge\")"
]
},
{
"cell_type": "markdown",
"source": [
"## 获取图片url"
],
"metadata": {
"collapsed": false
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [
"# List the URL of every image message in the course export.\n",
"import json\n",
"import requests\n",
"from IPython.display import Audio,display\n",
"from pydub import AudioSegment\n",
"import os\n",
"\n",
"with open('audio.json', encoding= \"UTF-8\") as f:\n",
"    audios = json.load(f)\n",
"\n",
"# NOTE(review): course id is read from the second entry (index 1), as before.\n",
"course_id = str(audios[1]['course_id'])\n",
"\n",
"# Keep the URLs in a named list so later cells can reuse them; entries without\n",
"# an attachment are skipped instead of raising KeyError.\n",
"# This cell deliberately does not touch audio_list, which belongs to the\n",
"# download cell and would otherwise be wiped when this cell runs after it.\n",
"image_urls = [\n",
"    entry['attachment']['url']\n",
"    for entry in audios\n",
"    if entry['category'] == \"PLAIN_IMAGE\" and 'attachment' in entry\n",
"]\n",
"\n",
"# To persist the list, write image_urls to course_id + '_pic_url.txt', one URL per line.\n",
"for url in image_urls:\n",
"    print(url)"
],
"metadata": {
"collapsed": false,
"ExecuteTime": {
"end_time": "2023-07-05T10:56:34.184660400Z",
"start_time": "2023-07-05T10:56:34.073384900Z"
}
}
},
{
"cell_type": "code",
"execution_count": null,
"outputs": [],
"source": [],
"metadata": {
"collapsed": false
}
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 0
}