{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# 爬取定投课堂资料\n",
"通过解析课程json文件,获取对应资源的url从而下载所有资料\n",
"获取课程id"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-05T10:56:39.054587Z",
"start_time": "2023-07-05T10:56:34.185665600Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"9月10日,对我们来说,应该是 “同学节”。\n"
]
},
{
"data": {
"text/html": [
"\n",
" \n",
" "
],
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import requests\n",
"import json\n",
"\n",
"from IPython.display import Audio,display\n",
"from pydub import AudioSegment\n",
"import os\n",
"\n",
"BearerToken = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhdWQiOiIxMDAwMDgzNDciLCJleHAiOjE3MjYyMzI1NTIsImp0aSI6IjU4ZDdhMmI1LWRmM2ItNDkwMy04NjQ1LTVmYThmNThhMDQwMSIsImlhdCI6MTcyNjE0NjE1MiwiaXNzIjoiYXBwdXNlciIsInVpZCI6ImJlMmViOGIyLTFhOTItNGVmMC05ZDAwLTA1YTlkN2E2OWRiMiIsInNjaGVtZSI6Imp3dGhzIiwic2lkIjoiMTE5YThmNmQtOTM1Ni00ZmU5LWIxMGQtNmMxODgyYTI5YTA4In0.1Ev8B_nRsJa-bXlJ2GSxp9Db20Uv6MgJ7LBbaDmmBEk'\n",
"\n",
"header = {\n",
" 'authority': 'bandu-api.songy.info',\n",
" 'accept': '*/*',\n",
" 'accept-language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',\n",
" 'authorization': 'Bearer ' + BearerToken,\n",
" 'dnt': '1',\n",
" 'origin': 'https://webapp.songy.info',\n",
" 'referer': 'https://webapp.songy.info/',\n",
" 'sec-ch-ua': '\"Not/A)Brand\";v=\"99\", \"Microsoft Edge\";v=\"115\", \"Chromium\";v=\"115\"',\n",
" 'sec-ch-ua-mobile': '?0',\n",
" 'sec-ch-ua-platform': '\"Windows\"',\n",
" 'sec-fetch-dest': 'empty',\n",
" 'sec-fetch-mode': 'cors',\n",
" 'sec-fetch-site': 'same-site',\n",
" 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36 Edg/115.0.1901.188',\n",
"}\n",
"\n",
"class_id = '80000002'\n",
"course_id = '573'\n",
"\n",
"# response = requests.get('https://bandu-api.songy.info/v1/communities/'+ class_id +'/courses/' + course_id + '/messages', headers=header)\n",
"response = requests.get('https://bandu-api.songy.info/v2/courses/' + course_id + '?expand=contents', headers=header)\n",
"# print(response.text)\n",
"\n",
"course = json.loads(response.text)\n",
"# print(course['data']['course_contents'])\n",
"\n",
"course_contents = course['data']['course_contents']\n",
"\n",
"audio_list = []\n",
"\n",
"for content in course_contents:\n",
" catagory = content['category']\n",
" match catagory:\n",
" case 'audio':\n",
" url = content['attachment']['url']\n",
" # print(url)\n",
" audio = requests.get(url)\n",
" # print(audio.status_code)\n",
" # display(Audio(audio.content))\n",
" audio_list.append(audio.content)\n",
" # print(audio['attachment']['url'])\n",
" # filename = os.path.basename(url)\n",
" # with open(os.path.join(course_id, filename), 'wb') as file:\n",
" # file.write(audio.content)\n",
" case 'text':\n",
" text = content['content']\n",
" print(text)\n",
" \n",
"audio_seg_list = []\n",
"for i in audio_list:\n",
" with open('temp.mp3', 'wb') as file:\n",
" file.write(i)\n",
" audio_part = AudioSegment.from_mp3('temp.mp3')\n",
" # audio_part = AudioSegment.from_mp3(i)\n",
" audio_seg_list.append(audio_part)\n",
"\n",
"# print(len(audio_seg_list))\n",
"x = sum(audio_seg_list)\n",
"\n",
"# 递归求和\n",
"# def sumOfList(list, size):\n",
"# if (size == 0):\n",
"# return 0\n",
"# else:\n",
"# return list[size - 1] + sumOfList(list, size - 1)\n",
"#\n",
"# total = sumOfList(list1, len(list1))\n",
"\n",
"audio_name = course['data']['title'] + '.mp3'\n",
"\n",
"x.export(audio_name, format=\"mp3\")\n",
"\n",
"display(Audio(audio_name))\n",
" \n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 获取图片url"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"ExecuteTime": {
"end_time": "2023-07-05T10:56:34.184660400Z",
"start_time": "2023-07-05T10:56:34.073384900Z"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"https://xuexi-courses-storage.firesbox.com/7000102069/replay/78b6da46-ecd8-46fb-a857-423ca6da8196.png\n"
]
}
],
"source": [
"# get all url and text\n",
"import json\n",
"import requests\n",
"from IPython.display import Audio,display\n",
"from pydub import AudioSegment\n",
"import os\n",
"\n",
"# get pic url list\n",
"\n",
"\n",
"with open('audio.json', encoding= \"UTF-8\") as f:\n",
" audios = json.load(f)\n",
"# print(type(audios[1])) # Output: dict\n",
"\n",
"course_id = str(audios[1]['course_id'])\n",
"# os.makedirs(course_id, exist_ok=True)\n",
"\n",
"audio_list = []\n",
"for audio in audios:\n",
" category = audio['category']\n",
" if category == \"PLAIN_IMAGE\":\n",
" url = audio['attachment']['url']\n",
" print(url)\n",
" # with open( course_id + '_pic_url.txt', 'a') as file:\n",
" # file.write(url)\n",
" # file.write(\"\\n\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}