246 lines
11 KiB
Python
Executable File
246 lines
11 KiB
Python
Executable File
# -*- coding: utf-8 -*-
|
|
import configparser
|
|
import os
|
|
import shutil
|
|
import sqlite3
|
|
import subprocess
|
|
from queue import Queue
|
|
from threading import Thread
|
|
|
|
import requests
|
|
|
|
import json
|
|
from headers import headers
|
|
from logging_config import setup_logging
|
|
from transcribe_media import convert_media
|
|
|
|
logger = setup_logging()

# Read the configuration file. The path is relative, so it is resolved against
# the current working directory of the process.
config = configparser.ConfigParser()
config.read('config.ini')

# Bearer token sent with every API request.
authorization_token = config['DEFAULT']['authorization_token']

# Integer options: number of download threads, retries per attachment, and the
# lowest course ID considered when no explicit course list is supplied.
max_download_threads, max_retry_attempts, download_id = (
    int(config['DEFAULT'][option])
    for option in ('max_download_threads', 'max_retry_attempts', 'download_id')
)

# Add the token to the shared request headers imported from the `headers` module.
headers['authorization'] = f'Bearer {authorization_token}'
|
|
|
|
|
|
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
    """Download a single attachment with aria2c, retrying on failure.

    Args:
        attachment: Mapping with at least the keys ``'name'`` and ``'url'``.
            ``'name'`` may be ``None`` or empty.
        course_id_folder: Folder of the course the attachment belongs to.
        course_audio_filename: Path of the merged course audio file. When it
            already exists, MP3 attachments are not downloaded again.
        max_retries: Maximum number of aria2c invocations for this attachment.

    Returns:
        None. Failures are logged, never raised.
    """
    # A missing name is handled like an empty one so that the extension logic
    # below cannot fail on ``None``.
    name = attachment['name'] or ''
    url = attachment['url']

    if not name or name.endswith(".m3u8"):
        logger.info("字符串为空")
        # Fall back to the last path component of the URL.
        download_filename = url[url.rfind('/') + 1:]
    else:
        download_filename = name

    # The target folder is computed once, outside the retry loop. Computing it
    # per attempt would append the extension again on every retry.
    file_extension = name.split('.')[-1].lower()
    if file_extension == 'mp3':
        if os.path.exists(course_audio_filename):
            logger.info(f"File {course_audio_filename} already exists, skipping download.")
            return
        target_folder = course_id_folder
    else:
        target_folder = os.path.join(course_id_folder, file_extension)

    filename = os.path.join(target_folder, download_filename)

    # aria2c keeps a "<file>.aria2" control file next to an unfinished
    # download, so a file that still has one is not treated as complete.
    if os.path.exists(filename) and not os.path.exists(filename + '.aria2'):
        logger.info(f"File {filename} already exists, skipping download.")
        return

    # Argument list instead of a shell string: URLs containing '&', spaces or
    # quotes are passed to aria2c unmodified.
    command = ['aria2c', '-o', filename, '-x', '16', '-s', '16', url]

    for attempt in range(1, max_retries + 1):
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers aria2c being absent from PATH.
            logger.error(f"Failed to download {attachment['name']}: {e}")
            if attempt == max_retries:
                logger.error(f"Failed to download {attachment['name']} after {max_retries} attempts.")
            else:
                logger.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
        else:
            logger.info(f"Download Command: {' '.join(command)}")
            return
|
|
|
|
|
|
def worker(queue, course_id_folder, course_audio_filename, max_retries):
    """Thread target: download attachments from ``queue`` until it is empty.

    Args:
        queue: ``queue.Queue`` of attachment mappings.
        course_id_folder: Passed through to ``download_attachment``.
        course_audio_filename: Passed through to ``download_attachment``.
        max_retries: Passed through to ``download_attachment``.
    """
    # Imported locally so the block needs no change to the file's import list.
    from queue import Empty

    while True:
        # get_nowait() avoids the empty()/get() race: with several workers a
        # blocking get() after a successful empty() check can wait forever
        # when another thread takes the last item in between.
        try:
            attachment = queue.get_nowait()
        except Empty:
            return
        try:
            download_attachment(attachment, course_id_folder, course_audio_filename, max_retries)
        except Exception as e:
            # Keep the worker alive for the remaining items.
            logger.error(f"Unexpected error while downloading attachment: {e}")
        finally:
            # Always acknowledge the item, otherwise Queue.join() never returns.
            queue.task_done()
|
|
|
|
|
|
def fetch_course(courseIds):
    """Select courses, fetch their contents and store them in ``courses.db``.

    For every selected course the contents JSON is read from
    ``json/<course_id>.json`` when that file exists; otherwise it is requested
    from the API and saved to that path. Each content item is inserted into the
    ``contents`` table (existing IDs are left untouched).

    Args:
        courseIds: Sequence of course IDs to process, or ``None`` to process
            every course whose ID is at least
            ``max(download_id, newest_course_id - 3)``.

    Returns:
        Tuple ``(course_ids, course_ids_dict)``: the list of selected course
        IDs and a dict mapping each ID to its title.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    conn = sqlite3.connect('courses.db')
    try:
        cursor = conn.cursor()

        # execute() returns the cursor itself, which is always truthy; the
        # empty-table case has to be detected on the fetched row.
        newest_row = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1').fetchone()
        if newest_row is not None:
            max_course_id = newest_row[0]
            logger.info(f"The maximum course ID is {max_course_id}")
        else:
            logger.info("No courses found in the database.")
            max_course_id = 11

        start_course_id = max(download_id, max_course_id - 3)
        if courseIds is None:
            cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
        else:
            placeholders = ','.join('?' * len(courseIds))
            cursor.execute(f'SELECT id, title FROM courses WHERE id IN ({placeholders})', tuple(courseIds))
        course_ids_data = cursor.fetchall()
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        for course_id in course_ids:
            logger.info(f"Processing course ID: {course_id}")

            json_filename = os.path.join('json', f'{course_id}.json')
            if os.path.exists(json_filename):
                logger.info(f"Course {course_id} JSON file already exists, using local file.")
                with open(json_filename, 'r', encoding='utf-8') as json_file:
                    contents_data = json.load(json_file)
            else:
                response = requests.get(f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
                                        headers=headers, timeout=30)
                # Fail before writing the cache file, so an error payload is
                # never stored as the course's contents.
                response.raise_for_status()
                contents_data = response.json()
                with open(json_filename, 'w', encoding='utf-8') as save_json_file:
                    json.dump(contents_data, save_json_file, ensure_ascii=False, indent=4)

            cursor.executemany('''
                INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url,
                                      mime_type)
                VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO NOTHING
            ''', (
                (item['id'], course_id, item['content'], item['category'], item['order'],
                 item['attachment']['url'] if item['attachment'] else None,
                 item['attachment']['mime_type'] if item['attachment'] else None)
                for item in contents_data['data']
            ))
            # Without a commit the inserts are discarded when the connection closes.
            conn.commit()

        return course_ids, course_ids_dict
    finally:
        conn.close()
|
|
|
|
|
|
def _run_download_workers(attachments, course_id_folder, course_audio_filename):
    """Download ``attachments`` with ``max_download_threads`` worker threads."""
    attachment_queue = Queue()
    for attachment in attachments:
        attachment_queue.put(attachment)

    threads = [
        Thread(target=worker,
               args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
        for _ in range(max_download_threads)
    ]
    for thread in threads:
        thread.start()
    # Workers return once the queue is drained, so joining the threads is
    # sufficient. Queue.join() is not used because it would block forever if a
    # worker ended without acknowledging an item.
    for thread in threads:
        thread.join()


def _merge_course_audio(audio_items, course_id_folder, course_audio_filename):
    """Concatenate the audio parts into ``course_audio_filename`` with ffmpeg.

    Returns True when the merged file exists afterwards (already present or
    created now) and False when the merge failed.
    """
    if os.path.exists(course_audio_filename):
        return True

    combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
    text_file = os.path.join(course_id_folder, 'input_files.txt')

    # Input list for ffmpeg's concat demuxer. Names are relative to the list
    # file; a single quote inside a name is written as '\''.
    with open(text_file, 'w', encoding='utf-8') as f:
        for audio_item in audio_items:
            escaped_name = audio_item['attachment']['name'].replace("'", "'\\''")
            f.write(f"file '{escaped_name}'\n")

    # -y overwrites a stale combined file from an interrupted run instead of
    # waiting for interactive confirmation.
    ffmpeg_command = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', text_file,
                      '-c', 'copy', combined_audio_filename]
    try:
        subprocess.run(ffmpeg_command, check=True)
        shutil.move(combined_audio_filename, course_audio_filename)
    except (subprocess.CalledProcessError, OSError) as e:
        logger.error(f"Failed to merge audio into {course_audio_filename}: {e}")
        return False
    finally:
        os.remove(text_file)
    return True


def _remove_audio_parts(audio_items, course_id_folder, course_audio_filename):
    """Delete the individual audio parts once the merged file exists."""
    merged_path = os.path.normpath(course_audio_filename)
    for audio_item in audio_items:
        audio_file_path = os.path.join(course_id_folder, audio_item['attachment']['name'])
        # Never delete the merged file itself when a part shares its path.
        if os.path.normpath(audio_file_path) == merged_path:
            continue
        try:
            os.remove(audio_file_path)
        except OSError:
            logger.error('delete file fail')


def _sort_attachments_by_extension(items, course_id_folder):
    """Move non-MP3 attachments lying in the course folder into ``<extension>/``."""
    for item in items:
        attachment = item['attachment']
        if not attachment or not attachment['name']:
            continue
        name = attachment['name']
        filename = os.path.join(course_id_folder, name)
        if not os.path.exists(filename):
            continue
        folder_name = name.split('.')[-1].lower()
        if folder_name == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _write_text_contents(items, course_id_folder, course_id):
    """Save the content of every item of category 'text' to ``<course_id>.txt``."""
    text_contents = [item['content'] for item in items if item['category'] == 'text']
    if not text_contents:
        return
    with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
        for content in text_contents:
            txt_file.write(content + '\n')


def _convert_course_video(course_id_folder):
    """Run ``convert_media`` on an MP4 of the course unless a .md file exists.

    The ``mp4`` sub-folder is walked recursively; when it holds several MP4
    files the last one visited is used.
    """
    mp4_folder = os.path.join(course_id_folder, 'mp4')
    if not os.path.exists(mp4_folder):
        return
    mp4_file = None
    exist_md_file = False
    for root, _dirs, files in os.walk(mp4_folder):
        for file in files:
            lowered = file.lower()
            # A .md file is taken to mean the video was already processed.
            # NOTE(review): presumably written by convert_media - confirm in transcribe_media.
            if lowered.endswith('.md'):
                exist_md_file = True
            if lowered.endswith('.mp4'):
                mp4_file = os.path.join(root, file)
    if not exist_md_file and mp4_file is not None:
        convert_media(mp4_file)


def download_course_contents(course_ids, course_ids_dict):
    """Download and post-process the attachments of every course in ``course_ids``.

    For each course, using the contents stored in ``json/<course_id>.json``:
    all attachments are downloaded into ``course/<course_id>``, audio parts are
    merged (ordered by their ``order`` field) into ``<title>.mp3`` and then
    deleted, other attachments found in the course folder are moved into a
    sub-folder named after their extension, text contents are written to
    ``<course_id>.txt`` and an MP4, if present, is passed to ``convert_media``.

    Args:
        course_ids: Iterable of course IDs to process.
        course_ids_dict: Mapping of course ID to course title; the title names
            the merged audio file.
    """
    os.makedirs('json', exist_ok=True)
    os.makedirs('course', exist_ok=True)

    for course_id in course_ids:
        course_id_folder = os.path.join('course', str(course_id))
        os.makedirs(course_id_folder, exist_ok=True)

        json_filename = os.path.join('json', f'{course_id}.json')
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            contents_data = json.load(json_file)
        items = contents_data['data']
        course_audio_filename = os.path.join(course_id_folder, f'{course_ids_dict[course_id]}.mp3')

        _run_download_workers([item['attachment'] for item in items if item['attachment']],
                              course_id_folder, course_audio_filename)

        # Audio items without an attachment have nothing to merge.
        audio_files = sorted(
            (item for item in items if item['category'] == 'audio' and item['attachment']),
            key=lambda item: item['order'],
        )
        # The parts are removed only when the merged file exists, so a failed
        # merge does not lose the downloaded audio.
        if audio_files and _merge_course_audio(audio_files, course_id_folder, course_audio_filename):
            _remove_audio_parts(audio_files, course_id_folder, course_audio_filename)

        _sort_attachments_by_extension(items, course_id_folder)
        _write_text_contents(items, course_id_folder, course_id)
        _convert_course_video(course_id_folder)
|
|
|
|
|
|
def get_course(course_list=None):
    """Fetch course metadata, then download the contents of those courses.

    Args:
        course_list: Course IDs to process, or ``None`` to let ``fetch_course``
            choose the courses itself.
    """
    ids_and_titles = fetch_course(course_list)
    # fetch_course returns the pair (course_ids, course_ids_dict).
    download_course_contents(*ids_and_titles)
|
|
|
|
|
|
if __name__ == '__main__':
    # Course IDs grouped by series, kept for reference when only one series
    # should be downloaded.
    # The Truth About Multilingualism: 381, 380, 382, 384, 385, 386, 387, 388, 391, 392, 393, 394, 399, 400, 402, 408, 409, 413, 414, 415, 423, 424, 534, 554
    # Writing Course: 225, 226, 228, 230, 231, 232, 238, 240, 564, 565, 566, 567, 568, 569, 243
    # Tea Talks: 87, 88, 89, 90, 91, 92, 93, 94, 97, 147, 177, 186, 190, 213, 219, 224, 235, 242, 263, 278, 289, 297, 305, 383, 398, 410, 418, 433, 505, 545, 582, 620, 638, 651, 664, 676, 686
    # Student Stories: 364, 367, 368, 370, 371, 372, 373, 374, 375, 376, 377, 378, 313, 320, 319, 406, 405, 404, 401, 403, 411, 416, 417, 421, 422, 427, 430, 447, 438, 444, 446, 485, 487, 489, 493, 496, 498, 502, 507, 509, 513, 515, 518, 519, 529, 537, 544, 557, 575, 584, 590, 595, 598, 599, 606, 608, 611, 613, 618, 621, 624, 628, 633, 635, 637, 653, 642, 645, 648, 652, 655, 657, 662, 665, 671, 673, 674, 677, 680, 682, 685, 687
    # Meet Again in Seven Years: 484, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 60, 61, 86, 95, 148, 233, 246, 273, 291, 317, 396, 412, 426, 442, 492, 506, 552, 589, 607, 619, 644, 656, 672, 683
    # Xiaolai's Talks Collection: 191, 192, 193, 194, 195, 196, 214, 215, 216, 217, 220, 221, 222, 227, 229, 234, 236, 237, 239, 241, 243, 244, 245, 247, 248, 249, 253, 290, 292, 390, 395, 397, 399, 400, 402, 409, 408, 414, 419, 423, 428, 429, 435, 436, 439, 440, 443, 445, 479, 488, 490, 491, 495, 497, 499, 500, 503, 508, 510, 514, 516, 517, 520, 521, 530, 532, 534, 538, 543, 547, 549, 554, 558, 562, 563, 570, 573, 578, 597, 609, 610, 612, 615, 616, 617, 622, 623, 626, 631, 634, 641, 640, 643, 647, 650, 654, 661, 668, 675, 679
    # Artificial Intelligence: 585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605
    #
    # To download one series, assign its IDs instead of None, e.g.
    # course_list = [585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605]
    course_list = None
    get_course(course_list=course_list)
|