爬取音频文件
This commit is contained in:
1723
courses/the truth of wealth/course_data.json
Normal file
1723
courses/the truth of wealth/course_data.json
Normal file
File diff suppressed because it is too large
Load Diff
18
courses/the truth of wealth/get_course_ids.py
Normal file
18
courses/the truth of wealth/get_course_ids.py
Normal file
@@ -0,0 +1,18 @@
|
||||
import json

# Load the saved course listing; json.load parses straight from the file
# handle, so no intermediate string copy of the document is needed.
with open('course_data.json', encoding="UTF-8") as file:
    data = json.load(file)

# Collect the id of every entry under data -> items, preserving file order.
ids = [item['id'] for item in data['data']['items']]

# Print the result.
print(ids)
|
||||
91
courses/the truth of wealth/parse_course.py
Normal file
91
courses/the truth of wealth/parse_course.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import json
import os
import tempfile
from pathlib import Path

import requests
from pydub import AudioSegment
|
||||
|
||||
|
||||
def get_course_id(date_file_path):
    """Return the ids of all items listed in a saved course-list JSON file.

    Args:
        date_file_path: Path to a UTF-8 encoded JSON file whose top-level
            object looks like ``{"data": {"items": [{"id": ...}, ...]}}``.

    Returns:
        A list holding the ``id`` value of every entry in ``data.items``,
        in the order they appear in the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If ``data``, ``items`` or an item's ``id`` key is missing.
    """
    with open(date_file_path, encoding="UTF-8") as file:
        # Parse directly from the file handle instead of reading the whole
        # document into a string first.
        data = json.load(file)

    return [item['id'] for item in data['data']['items']]
|
||||
|
||||
|
||||
def request_date(course_id, request_token, timeout=30):
    """Fetch the course-detail JSON payload for one course from the API.

    Args:
        course_id: Identifier of the course; it is interpolated into the
            request URL.
        request_token: Bearer token sent in the ``Authorization`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The raw response body as ``bytes`` (JSON text, contents expanded).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            an error body is never mistaken for course data downstream.
        requests.RequestException: On connection problems or timeout.
    """
    url = f'https://bandu-api.songy.info/v2/courses/{course_id}?expand=contents'
    headers = {"Authorization": "Bearer " + request_token}
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly here rather than with a confusing KeyError while parsing.
    response.raise_for_status()
    return response.content
|
||||
|
||||
|
||||
def download_mp3(url, filename, timeout=30):
    """Download a single file over HTTP and stream it to disk.

    Args:
        url: Address of the file to download.
        filename: Destination path; it is created or overwritten.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True if the server answered 200 and the body was written to
        ``filename``; False otherwise (a message is printed and no file is
        written).

    Raises:
        requests.RequestException: On connection problems or timeout.
    """
    # The context manager releases the streamed connection back to the pool
    # even when the body is not fully consumed or writing fails.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {url}, status code {response.status_code}")
            return False

        with open(filename, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:  # skip empty keep-alive chunks
                    f.write(chunk)
    return True
|
||||
|
||||
|
||||
def merge_mp3_files(mp3_files, output_filename):
    """Concatenate audio files in order and export the result as one MP3.

    Args:
        mp3_files: Iterable of paths to the audio files, in playback order.
        output_filename: Path of the merged MP3 file to write.
    """
    # Append each file as soon as it is decoded instead of first collecting
    # every decoded segment in a separate list.
    merged_audio = AudioSegment.empty()
    for file in mp3_files:
        merged_audio += AudioSegment.from_file(file)

    # Export the merged audio.
    merged_audio.export(output_filename, format="mp3")
|
||||
|
||||
|
||||
def get_audio(audio_data):
    """Download every audio part of a course and merge them into one MP3.

    The merged file is written to the current working directory and named
    after the course title, with ``.`` and ``/`` replaced by ``_``.

    Args:
        audio_data: JSON text (``str`` or ``bytes``) of a course-detail
            response; ``data.title`` and ``data.course_contents`` are read.

    Returns:
        The ``Path`` of the merged MP3, or None when the course contains no
        audio items (nothing is downloaded or written in that case).

    Raises:
        KeyError: If an expected key is missing from the payload.
        FileNotFoundError: If one of the parts could not be downloaded.
    """
    # Parse the JSON payload.
    data = json.loads(audio_data)
    course = data["data"]

    # Extract the MP3 links of all audio items, in course order.
    mp3_urls = [
        item["attachment"]["raw_url"]
        for item in course["course_contents"]
        if item["category"] == "audio"
    ]

    # Replace characters that are not allowed in the file name.
    output_title = course["title"].replace(".", "_").replace("/", "_")
    output_filename = Path(f"{output_title}.mp3")

    if not mp3_urls:
        print(f"No audio content found for {output_title}; nothing to download.")
        return None

    # Keep the parts in a private temporary directory: they cannot clobber
    # files such as "0.mp3" in the working directory, and they are removed
    # even when a download or the merge fails.
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_mp3_files = []
        for idx, url in enumerate(mp3_urls):
            part = Path(temp_dir) / f"{idx}.mp3"
            download_mp3(url, part)
            if not part.exists():
                # Stop here with a clear message instead of failing later
                # inside the merge step or producing incomplete audio.
                raise FileNotFoundError(f"Part {idx} could not be downloaded from {url}")
            temp_mp3_files.append(part)

        # Merge the downloaded parts in order.
        merge_mp3_files(temp_mp3_files, output_filename)

    print(f"All MP3 files have been downloaded, merged into {output_filename}, and temporary files have been removed.")
    return output_filename
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # The API bearer token is a credential and must not be committed to
    # source control; it is supplied through the environment instead.
    token = os.environ.get("BANDU_API_TOKEN")
    if not token:
        raise SystemExit("Set the BANDU_API_TOKEN environment variable to your API bearer token.")

    ids = get_course_id('course_data.json')
    if not ids:
        raise SystemExit("No course ids found in course_data.json.")

    # Only the first course in the listing is fetched and converted.
    json_data = request_date(ids[0], token)
    get_audio(json_data)
|
||||
@@ -1,2 +1,3 @@
|
||||
requests~=2.31.0
|
||||
ipython~=8.24.0
|
||||
ipython~=8.24.0
|
||||
Scrapy~=2.11.2
|
||||
Reference in New Issue
Block a user