Files
songyi/course.py
2024-11-22 20:33:57 +08:00

202 lines
7.7 KiB
Python
Executable File

# -*- coding: utf-8 -*-
import shutil
import requests
import sqlite3
import os
import json
import subprocess
import configparser
from queue import Queue
from headers import headers
from threading import Thread
# Load runtime settings from the [DEFAULT] section of config.ini.
config = configparser.ConfigParser()
config.read('config.ini')
_default_section = config['DEFAULT']
authorization_token = _default_section['authorization_token']
# INI values are plain text, so the numeric options are converted explicitly.
max_download_threads = int(_default_section['max_download_threads'])
max_retry_attempts = int(_default_section['max_retry_attempts'])
start_course_id = int(_default_section['start_course_id'])
# Add the bearer token to the request headers imported from the headers module.
headers['authorization'] = 'Bearer ' + authorization_token
def download_attachment(attachment, course_id_folder, max_retries):
    """Download one attachment into ``course_id_folder`` with aria2c, retrying on failure.

    The local file name is ``attachment['name']``. When that name is empty,
    ``None``, or ends with ``.m3u8``, the last path segment of the URL is used
    instead.

    Args:
        attachment: Mapping with at least the ``'name'`` and ``'url'`` keys.
        course_id_folder: Directory the downloaded file is written into.
        max_retries: Maximum number of aria2c invocations. With a value of
            ``0`` or less no download is attempted.

    Returns:
        None. Progress and failures are reported on stdout; the function
        never raises because of a failed download.
    """
    name = attachment['name']
    url = attachment['url']
    if name in ["", None] or name.endswith(".m3u8"):
        print("字符串为空")
        # Fall back to everything after the last slash of the URL.
        download_filename = url[url.rfind('/') + 1:]
        print(url)
    else:
        download_filename = name

    filename = os.path.join(course_id_folder, download_filename)
    # An argument list (no shell) keeps URLs containing '&', '?' or ';' and
    # file names containing spaces from being reinterpreted by a shell.
    command = ["aria2c", "-o", filename, "-x", "16", "-s", "16", url]

    for attempt in range(1, max_retries + 1):
        print(download_filename)
        print(name)
        print(' '.join(command))
        try:
            subprocess.run(command, check=True)
            return
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable aria2c binary, which
            # the shell-based invocation used to report as a non-zero exit.
            print(f"Failed to download {attachment['name']}: {e}")
            if attempt == max_retries:
                print(f"Failed to download {attachment['name']} after {max_retries} attempts.")
            else:
                print(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
def worker(queue, course_id_folder, max_retries):
    """Thread target: drain ``queue`` and download every attachment taken from it.

    Args:
        queue: A ``queue.Queue`` of attachment mappings. ``task_done()`` is
            called once for every item taken, so ``queue.join()`` in the
            caller can return.
        course_id_folder: Directory passed through to ``download_attachment``.
        max_retries: Retry limit passed through to ``download_attachment``.

    Returns:
        None, once the queue has no more items.
    """
    # The ``queue`` parameter shadows the module name inside this function,
    # so the exception class is imported by name here.
    from queue import Empty

    while True:
        try:
            # Non-blocking get: checking empty() and then calling a blocking
            # get() can hang forever when another worker takes the last item
            # in between the two calls.
            attachment = queue.get_nowait()
        except Empty:
            return
        try:
            download_attachment(attachment, course_id_folder, max_retries)
        except Exception as e:
            # Keep this worker alive so the remaining items are still
            # processed and the caller's queue.join() can complete.
            print(f"Unexpected error while downloading attachment: {e}")
        finally:
            queue.task_done()
def _attachment_filename(attachment):
    """Return the local file name download_attachment() saves ``attachment`` under."""
    name = attachment['name']
    # Same rule as the downloader: an empty/None name or an .m3u8 name falls
    # back to the last path segment of the URL.
    if name in ["", None] or name.endswith(".m3u8"):
        url = attachment['url']
        return url[url.rfind('/') + 1:]
    return name


def _load_course_contents(course_id):
    """Return the contents payload of a course from the JSON cache, fetching it if absent."""
    json_filename = os.path.join('json', f'{course_id}.json')
    if os.path.exists(json_filename):
        print(f"Course {course_id} JSON file already exists, using local file.")
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)
    response = requests.get(
        f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
        headers=headers,
        timeout=30,  # without a timeout a stalled connection blocks the run forever
    )
    # Fail before caching so an HTTP error body is never stored and then
    # silently reused on every later run.
    response.raise_for_status()
    contents_data = response.json()
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(contents_data, json_file, ensure_ascii=False, indent=4)
    return contents_data


def _store_course_contents(cursor, course_id, contents_data):
    """Insert every content item of a course into the ``contents`` table (existing ids are kept)."""
    for item in contents_data['data']:
        attachment = item['attachment']
        cursor.execute('''
            INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url, mime_type)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO NOTHING
        ''', (item['id'], course_id, item['content'], item['category'], item['order'],
              attachment['url'] if attachment else None,
              attachment['mime_type'] if attachment else None))


def _download_attachments(contents_data, course_id_folder):
    """Download all attachments of a course into its folder using worker threads."""
    attachment_queue = Queue()
    for item in contents_data['data']:
        if item['attachment']:
            attachment_queue.put(item['attachment'])
    # At least one worker is required, otherwise join() below would wait
    # forever on a non-empty queue.
    threads = []
    for _ in range(max(1, max_download_threads)):
        t = Thread(target=worker, args=(attachment_queue, course_id_folder, max_retry_attempts))
        t.start()
        threads.append(t)
    attachment_queue.join()
    for t in threads:
        t.join()


def _merge_audio(contents_data, course_id_folder):
    """Concatenate the course's audio files into combined_audio.mp3 and remove the parts."""
    audio_items = [
        item for item in contents_data['data']
        if item['category'] == 'audio' and item['attachment']
    ]
    if not audio_items:
        return
    audio_items.sort(key=lambda item: item['order'])
    audio_names = [_attachment_filename(item['attachment']) for item in audio_items]
    combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')

    if not os.path.exists(combined_audio_filename):
        text_file = os.path.join(course_id_folder, 'input_files.txt')
        with open(text_file, 'w', encoding='utf-8') as f:
            for name in audio_names:
                # Inside a single-quoted entry a literal quote is written as '\''.
                escaped = name.replace("'", "'\\''")
                f.write(f"file '{escaped}'\n")
        ffmpeg_command = ["ffmpeg", "-f", "concat", "-safe", "0", "-i", text_file,
                          "-c", "copy", combined_audio_filename]
        try:
            merged = subprocess.run(ffmpeg_command).returncode == 0
        except OSError as e:
            print(f"Could not run ffmpeg: {e}")
            merged = False
        finally:
            os.remove(text_file)
        if not merged:
            # Keep the downloaded parts: deleting them after a failed merge
            # would lose the audio entirely.
            print(f"Audio merge failed in {course_id_folder}, keeping the source audio files.")
            return

    # Remove the individual audio files now that the combined file exists.
    for name in audio_names:
        try:
            os.remove(os.path.join(course_id_folder, name))
        except OSError:
            print('delete file fail')


def _organize_files(contents_data, course_id_folder):
    """Move each downloaded non-mp3 attachment into a sub-folder named after its extension."""
    for item in contents_data['data']:
        attachment = item['attachment']
        if not attachment:
            continue
        name = _attachment_filename(attachment)
        filename = os.path.join(course_id_folder, name)
        if not name or not os.path.isfile(filename):
            continue
        _, dot, file_extension = name.rpartition('.')
        file_extension = file_extension.lower()
        # Files without an extension have no target folder; mp3 files stay in place.
        if not dot or not file_extension or file_extension == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, file_extension)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _write_text_contents(contents_data, course_id, course_id_folder):
    """Write the content of every 'text' item to ``<course_id>.txt`` in the course folder."""
    text_contents = [item['content'] for item in contents_data['data'] if item['category'] == 'text']
    if not text_contents:
        return
    with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
        txt_file.writelines(content + '\n' for content in text_contents)


def get_course():
    """Fetch, store and download every course with id >= ``start_course_id``.

    Phase 1 reads the course ids from the ``courses`` table of ``courses.db``,
    loads each course's contents (from ``json/<id>.json`` if present,
    otherwise from the API, caching the response there) and inserts the items
    into the ``contents`` table.

    Phase 2 handles each course whose ``course/<id>`` folder does not exist
    yet: it downloads the attachments, merges the audio files into
    ``combined_audio.mp3``, sorts the remaining files into per-extension
    sub-folders and writes the text items to ``<id>.txt``. Courses whose
    folder already exists are skipped.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    conn = sqlite3.connect('courses.db')
    cursor = conn.cursor()
    try:
        cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
        course_ids_data = cursor.fetchall()
        print(course_ids_data)
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)
        print(course_ids_dict)
        print(course_ids)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        # Phase 1: collect all course data and store it in the database.
        for course_id in course_ids:
            print(f"Processing course ID: {course_id}")
            contents_data = _load_course_contents(course_id)
            _store_course_contents(cursor, course_id, contents_data)
            conn.commit()
    finally:
        # Release the database even when a request or insert fails.
        cursor.close()
        conn.close()

    # Phase 2: download attachments and post-process each course folder.
    for course_id in course_ids:
        course_id_folder = os.path.join('course', str(course_id))
        if os.path.exists(course_id_folder):
            print(f"Course {course_id} folder already exists, skipping download and merge operations.")
            continue
        os.makedirs(course_id_folder)

        json_filename = os.path.join('json', f'{course_id}.json')
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            contents_data = json.load(json_file)

        _download_attachments(contents_data, course_id_folder)
        _merge_audio(contents_data, course_id_folder)
        _organize_files(contents_data, course_id_folder)
        _write_text_contents(contents_data, course_id, course_id_folder)
# Run the full fetch-and-download pipeline only when executed as a script.
if __name__ == '__main__':
    get_course()