{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# 爬取定投课堂资料\n",
    "通过解析课程json文件,获取对应资源的url从而下载所有资料\n",
    "获取课程id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-05T10:56:39.054587Z",
     "start_time": "2023-07-05T10:56:34.185665600Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "9月10日,对我们来说,应该是 “同学节”。\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       " \n",
       " "
      ],
      "text/plain": [
       ""
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import getpass\n",
    "import io\n",
    "import json\n",
    "import os\n",
    "import re\n",
    "\n",
    "import requests\n",
    "from IPython.display import Audio, display\n",
    "from pydub import AudioSegment\n",
    "\n",
    "# Credentials must never be stored in the notebook: take the API token from the\n",
    "# BANDU_BEARER_TOKEN environment variable, or prompt for it interactively.\n",
    "BearerToken = os.environ.get('BANDU_BEARER_TOKEN') or getpass.getpass('Bearer token: ')\n",
    "\n",
    "# requests applies no timeout by default; without one a stalled connection blocks the cell.\n",
    "REQUEST_TIMEOUT = 30  # seconds\n",
    "\n",
    "# Browser-style headers sent with the course API request; 'authorization' carries the token.\n",
    "header = {\n",
    "    'authority': 'bandu-api.songy.info',\n",
    "    'accept': '*/*',\n",
    "    'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
    "    'authorization': 'Bearer ' + BearerToken,\n",
    "    'dnt': '1',\n",
    "    'origin': 'https://webapp.songy.info',\n",
    "    'referer': 'https://webapp.songy.info/',\n",
    "    'sec-ch-ua': '\"Not/A)Brand\";v=\"99\", \"Microsoft Edge\";v=\"115\", \"Chromium\";v=\"115\"',\n",
    "    'sec-ch-ua-mobile': '?0',\n",
    "    'sec-ch-ua-platform': '\"Windows\"',\n",
    "    'sec-fetch-dest': 'empty',\n",
    "    'sec-fetch-mode': 'cors',\n",
    "    'sec-fetch-site': 'same-site',\n",
    "    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',\n",
    "}\n",
    "\n",
    "# class_id is only needed by the v1 endpoint\n",
    "# /v1/communities/<class_id>/courses/<course_id>/messages; the v2 call below uses course_id alone.\n",
    "class_id = '80000002'\n",
    "course_id = '573'\n",
    "\n",
    "response = requests.get(\n",
    "    'https://bandu-api.songy.info/v2/courses/' + course_id + '?expand=contents',\n",
    "    headers=header,\n",
    "    timeout=REQUEST_TIMEOUT,\n",
    ")\n",
    "# Stop on HTTP errors (for example a rejected token) instead of failing later with a KeyError.\n",
    "response.raise_for_status()\n",
    "\n",
    "course = json.loads(response.text)\n",
    "course_contents = course['data']['course_contents']\n",
    "\n",
    "# Walk the course parts in order: keep the raw bytes of every audio part, print every text part.\n",
    "audio_list = []\n",
    "for content in course_contents:\n",
    "    match content['category']:\n",
    "        case 'audio':\n",
    "            audio = requests.get(content['attachment']['url'], timeout=REQUEST_TIMEOUT)\n",
    "            # Never hand an HTTP error page to the MP3 decoder.\n",
    "            audio.raise_for_status()\n",
    "            audio_list.append(audio.content)\n",
    "        case 'text':\n",
    "            print(content['content'])\n",
    "\n",
    "# Decode each part straight from memory, so no temp.mp3 is left in the working directory.\n",
    "audio_seg_list = [AudioSegment.from_mp3(io.BytesIO(part)) for part in audio_list]\n",
    "\n",
    "if audio_seg_list:\n",
    "    # Concatenate all parts into a single track, in course order.\n",
    "    x = sum(audio_seg_list)\n",
    "    # The title becomes the file name, so replace characters that are invalid in file names.\n",
    "    audio_name = re.sub(r'[\\\\/:*?\"<>|]', '_', course['data']['title']) + '.mp3'\n",
    "    # export() returns the open output file; close it so the handle is not leaked.\n",
    "    x.export(audio_name, format=\"mp3\").close()\n",
    "    display(Audio(audio_name))\n",
    "else:\n",
    "    # Without this guard sum([]) is the int 0 and .export() raises AttributeError.\n",
    "    print('No audio parts found for course ' + course_id)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 获取图片url"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2023-07-05T10:56:34.184660400Z",
     "start_time": "2023-07-05T10:56:34.073384900Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "https://xuexi-courses-storage.firesbox.com/7000102069/replay/78b6da46-ecd8-46fb-a857-423ca6da8196.png\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import os\n",
    "\n",
    "import requests\n",
    "from IPython.display import Audio, display\n",
    "from pydub import AudioSegment\n",
    "\n",
    "# Load the course entries stored in audio.json (a JSON array of dicts).\n",
    "with open('audio.json', encoding=\"UTF-8\") as f:\n",
    "    audios = json.load(f)\n",
    "\n",
    "# NOTE(review): the id is read from the second entry (index 1), not the first; confirm this is intended.\n",
    "# It is meant for naming per-course output files such as '<course_id>_pic_url.txt'.\n",
    "course_id = str(audios[1]['course_id'])\n",
    "\n",
    "# Not used in this cell; kept so that running the cell still resets the list.\n",
    "audio_list = []\n",
    "\n",
    "# Print the attachment URL of every image entry, in file order.\n",
    "image_entries = (entry for entry in audios if entry['category'] == \"PLAIN_IMAGE\")\n",
    "for entry in image_entries:\n",
    "    print(entry['attachment']['url'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}