# -*- coding: utf-8 -*-
"""Fetch course contents, download their attachments and post-process them.

The script reads the most recent courses from ``courses.db``, caches each
course's contents as JSON, stores them in the ``contents`` table, downloads
the attachments with ``aria2c``, merges audio parts with FFmpeg and hands the
audio track of every MP4 to ``process_audio_file``.
"""
import configparser
import json
import os
import shutil
import sqlite3
import subprocess
from queue import Empty, Queue
from threading import Thread

import requests

from headers import headers
from video_voice_process import process_audio_file

# Read the configuration file.
config = configparser.ConfigParser()
config.read('config.ini')
authorization_token = config['DEFAULT']['authorization_token']
max_download_threads = int(config['DEFAULT']['max_download_threads'])
max_retry_attempts = int(config['DEFAULT']['max_retry_attempts'])

# The imported header dict is shared; add the bearer token to it in place.
headers['authorization'] = f'Bearer {authorization_token}'

# Upper bound (seconds) for a single API request so a stalled connection
# cannot block the script forever.
REQUEST_TIMEOUT_SECONDS = 30


def _file_extension(name):
    """Return the lower-cased extension of *name* without the dot ('' if none)."""
    return os.path.splitext(name)[1][1:].lower()


def _concat_quote(name):
    """Escape single quotes so *name* can sit inside a quoted concat-list entry."""
    return name.replace("'", "'\\''")


def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
    """Download one attachment with aria2c, retrying on failure.

    Args:
        attachment: Mapping with at least a ``url`` key and usually a ``name``.
        course_id_folder: Folder of the course the attachment belongs to.
        course_audio_filename: Path of the merged course MP3; when it already
            exists, MP3 parts are not downloaded again.
        max_retries: Maximum number of aria2c invocations.

    Non-MP3 files are stored in a sub-folder named after their extension.
    Failures are reported on stdout; nothing is raised for a failed download.
    """
    name = attachment.get('name') or ''
    url = attachment['url']

    if not name or name.endswith(".m3u8"):
        print("字符串为空")
        # Fall back to everything after the last slash of the URL.
        download_filename = url[url.rfind('/') + 1:]
        print(url)
    else:
        download_filename = name
    print(download_filename)
    print(name)

    # Resolve the target folder once. Doing this inside the retry loop would
    # append the extension folder again on every retry.
    file_extension = _file_extension(name)
    if file_extension == 'mp3':
        if os.path.exists(course_audio_filename):
            print(f"File {course_audio_filename} already exists, skipping download.")
            return
        target_folder = course_id_folder
    elif file_extension:
        target_folder = os.path.join(course_id_folder, file_extension)
    else:
        target_folder = course_id_folder

    filename = os.path.join(target_folder, download_filename)
    # aria2c keeps a "<file>.aria2" control file next to an unfinished
    # download, so only treat the file as complete when that is absent.
    if os.path.exists(filename) and not os.path.exists(filename + '.aria2'):
        print(f"File {filename} already exists, skipping download.")
        return

    # Argument list instead of a shell string: URLs may contain '&', '?' or
    # spaces that a shell would interpret.
    command = ['aria2c', '-o', filename, '-x', '16', '-s', '16', url]
    for attempt in range(1, max_retries + 1):
        print(' '.join(command))
        try:
            subprocess.run(command, check=True)
            return
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing aria2c executable.
            print(f"Failed to download {name}: {e}")
            if attempt == max_retries:
                print(f"Failed to download {name} after {max_retries} attempts.")
            else:
                print(f"Retrying {name}... ({attempt}/{max_retries})")


def worker(queue, course_id_folder, course_audio_filename, max_retries):
    """Drain *queue*, downloading every attachment taken from it.

    The worker returns once the queue is empty. ``task_done()`` is always
    called, even when a download raises, so ``Queue.join()`` cannot deadlock.
    """
    while True:
        try:
            # get_nowait avoids the race between empty() and a blocking get()
            # when several workers compete for the last item.
            attachment = queue.get_nowait()
        except Empty:
            return
        try:
            download_attachment(attachment, course_id_folder, course_audio_filename, max_retries)
        except Exception as e:  # thread boundary: report and keep draining
            print(f"Unexpected error while downloading {attachment!r}: {e}")
        finally:
            queue.task_done()


def convert_mp4(mp4_file):
    """Extract the audio track of *mp4_file* into a WAV file next to it.

    Returns:
        The path of the WAV file, or ``None`` when FFmpeg fails or is not
        installed.
    """
    # The WAV file shares the MP4's directory and base name.
    wav_file = os.path.splitext(mp4_file)[0] + '.wav'
    command = [
        'ffmpeg',
        '-y',
        '-i', mp4_file,
        '-vn',                   # drop the video stream
        '-acodec', 'pcm_s16le',  # PCM signed 16-bit little-endian
        '-ar', '44100',          # 44100 Hz sample rate
        '-ac', '2',              # two channels (stereo)
        wav_file,
    ]
    try:
        subprocess.run(command, check=True)
    except subprocess.CalledProcessError as e:
        print(f"转换失败: {e}")
        return None
    except FileNotFoundError:
        print("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
        return None
    print(f"成功将 {mp4_file} 转换为 {wav_file}")
    return wav_file


def _select_recent_courses(cursor):
    """Return ``(id, title)`` rows for the courses with the highest ids."""
    cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1')
    row = cursor.fetchone()
    # fetchone() yields None for an empty table; the cursor itself is always
    # truthy and must not be used for this test.
    if row:
        max_course_id = row[0]
        print(f"The maximum course ID is {max_course_id}")
    else:
        print("No courses found in the database.")
        max_course_id = 11
    start_course_id = max_course_id - 5
    cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
    return cursor.fetchall()


def _load_course_contents(course_id):
    """Return the contents of a course, using the local JSON cache if present."""
    json_filename = os.path.join('json', f'{course_id}.json')
    if os.path.exists(json_filename):
        print(f"Course {course_id} JSON file already exists, using local file.")
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    response = requests.get(
        f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
        headers=headers,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail before caching so an error response is never stored as course data.
    response.raise_for_status()
    contents_data = response.json()
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(contents_data, json_file, ensure_ascii=False, indent=4)
    return contents_data


def _store_contents(cursor, course_id, contents_data):
    """Insert the content items of one course; existing ids are left alone."""
    for item in contents_data['data']:
        attachment = item['attachment']
        cursor.execute('''
            INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url, mime_type)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(id) DO NOTHING
        ''', (item['id'], course_id, item['content'], item['category'], item['order'],
              attachment['url'] if attachment else None,
              attachment['mime_type'] if attachment else None))


def _download_attachments(attachments, course_id_folder, course_audio_filename):
    """Download *attachments* with a pool of worker threads and wait for them."""
    attachment_queue = Queue()
    for attachment in attachments:
        attachment_queue.put(attachment)

    # At least one worker, otherwise join() below would never return.
    threads = [
        Thread(target=worker,
               args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
        for _ in range(max(1, max_download_threads))
    ]
    for t in threads:
        t.start()

    attachment_queue.join()
    for t in threads:
        t.join()


def _merge_audio(audio_items, course_id_folder, course_audio_filename):
    """Concatenate the audio parts into the course MP3 and remove the parts."""
    audio_items = sorted(audio_items, key=lambda item: item['order'])

    if not os.path.exists(course_audio_filename):
        combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
        text_file = os.path.join(course_id_folder, 'input_files.txt')
        with open(text_file, 'w', encoding='utf-8') as f:
            for audio_file in audio_items:
                f.write(f"file '{_concat_quote(audio_file['attachment']['name'])}'\n")
        # '-y' keeps FFmpeg from waiting on an overwrite prompt when a
        # previous run left combined_audio.mp3 behind.
        ffmpeg_command = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0',
                          '-i', text_file, '-c', 'copy', combined_audio_filename]
        try:
            subprocess.run(ffmpeg_command, check=True)
            shutil.move(combined_audio_filename, course_audio_filename)
        except (subprocess.CalledProcessError, OSError) as e:
            print(f"Failed to merge audio into {course_audio_filename}: {e}")
        finally:
            if os.path.exists(text_file):
                os.remove(text_file)

    if not os.path.exists(course_audio_filename):
        # The merge failed: keep the downloaded parts for the next run.
        return

    # Delete the temporary audio parts that were downloaded.
    for item in audio_items:
        audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
        try:
            os.remove(audio_file_path)
        except OSError:
            print('delete file fail')


def _organize_attachments(items, course_id_folder):
    """Move non-MP3 attachments lying in the course folder into per-extension folders."""
    for item in items:
        attachment = item['attachment']
        name = attachment.get('name') if attachment else None
        if not name:
            continue
        filename = os.path.join(course_id_folder, name)
        if not os.path.exists(filename):
            continue
        folder_name = _file_extension(name)
        if not folder_name or folder_name == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _process_course(course_id, course_title, contents_data):
    """Download, merge, organize and post-process everything for one course."""
    course_id_folder = os.path.join('course', str(course_id))
    os.makedirs(course_id_folder, exist_ok=True)
    course_audio_filename = os.path.join(course_id_folder, f'{course_title}.mp3')
    items = contents_data['data']

    # Download all attachments.
    _download_attachments(
        [item['attachment'] for item in items if item['attachment']],
        course_id_folder,
        course_audio_filename,
    )

    # Merge audio parts; items without a named attachment cannot be merged.
    audio_files = [
        item for item in items
        if item['category'] == 'audio'
        and item['attachment'] and item['attachment'].get('name')
    ]
    if audio_files:
        _merge_audio(audio_files, course_id_folder, course_audio_filename)

    _organize_attachments(items, course_id_folder)

    # Save the content of every 'text' item to a TXT file.
    text_contents = [item['content'] for item in items if item['category'] == 'text']
    if text_contents:
        with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
            for content in text_contents:
                txt_file.write(content + '\n')

    # Convert every MP4 below the mp4 folder to WAV and process the audio.
    mp4_folder = os.path.join(course_id_folder, 'mp4')
    if os.path.exists(mp4_folder):
        for root, _dirs, files in os.walk(mp4_folder):
            for file in files:
                if file.lower().endswith('.mp4'):
                    wav_file = convert_mp4(os.path.join(root, file))
                    if wav_file is not None:
                        process_audio_file(wav_file)


def get_course():
    """Synchronise and process the most recent courses.

    First the contents of every selected course are fetched (or read from the
    JSON cache) and stored in the ``contents`` table; afterwards attachments
    are downloaded and post-processed course by course.
    """
    contents_by_course = {}

    conn = sqlite3.connect('courses.db')
    try:
        cursor = conn.cursor()
        course_ids_data = _select_recent_courses(cursor)
        print(course_ids_data)
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)
        print(course_ids_dict)
        print(course_ids)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        # Request all course contents first and save them to the database.
        for course_id in course_ids:
            print(f"Processing course ID: {course_id}")
            contents_data = _load_course_contents(course_id)
            _store_contents(cursor, course_id, contents_data)
            conn.commit()
            contents_by_course[course_id] = contents_data
        cursor.close()
    finally:
        # Release the database even when fetching or inserting fails.
        conn.close()

    # All course information is stored now; download and post-process.
    for course_id in course_ids:
        _process_course(course_id, course_ids_dict[course_id], contents_by_course[course_id])


if __name__ == '__main__':
    get_course()