# -*- coding: utf-8 -*-
"""Download course attachments, merge per-course audio and process videos.

The script reads the most recent courses from ``courses.db``, fetches (or
loads from the local ``json/`` cache) each course's contents, stores them in
the ``contents`` table, downloads every attachment with ``aria2c``, merges
the audio parts into one mp3 with ``ffmpeg``, writes the text contents to a
``.txt`` file and finally hands the course video's audio track to
``process_audio_file``.
"""
import configparser
import json
import logging
import os
import shutil
import sqlite3
import subprocess
from logging.handlers import RotatingFileHandler
from queue import Empty, Queue
from threading import Thread

import requests

from headers import headers
from video_voice_process import process_audio_file

# Configure logging: console plus a rotating log file.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    handlers=[
                        logging.StreamHandler(),  # console log
                        RotatingFileHandler('app.log', maxBytes=1024*1024*5, backupCount=3)  # log file
                    ])

# Read the configuration file.
config = configparser.ConfigParser()
config.read('config.ini')
authorization_token = config['DEFAULT']['authorization_token']
max_download_threads = int(config['DEFAULT']['max_download_threads'])
max_retry_attempts = int(config['DEFAULT']['max_retry_attempts'])

headers['authorization'] = f'Bearer {authorization_token}'

# Seconds to wait for the contents API before giving up on a request.
REQUEST_TIMEOUT = 30
# Course id assumed when the ``courses`` table is empty.
DEFAULT_MAX_COURSE_ID = 11
# How far below the newest course id processing starts.
RECENT_COURSE_WINDOW = 5

_INSERT_CONTENT_SQL = '''
    INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url, mime_type)
    VALUES (?, ?, ?, ?, ?, ?, ?)
    ON CONFLICT(id) DO NOTHING
'''


def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
    """Download one attachment with aria2c, retrying on failure.

    Args:
        attachment: dict with at least the keys ``'name'`` and ``'url'``.
        course_id_folder: folder of the course the attachment belongs to.
        course_audio_filename: path of the merged course mp3; when it already
            exists, mp3 attachments are not downloaded again.
        max_retries: maximum number of aria2c invocations.

    Non-mp3 files are stored in a sub-folder named after the file extension;
    mp3 files are stored directly in ``course_id_folder``.
    """
    # A missing name is treated like an empty one so that the extension
    # lookup below cannot raise on None.
    name = attachment['name'] or ''
    url = attachment['url']

    if not name or name.endswith(".m3u8"):
        logging.info("字符串为空")
        # Fall back to the last path component of the URL.
        download_filename = url[url.rfind('/') + 1:]
    else:
        download_filename = name

    # The target folder is computed once, outside the retry loop; computing
    # it per attempt nested the extension folder one level deeper per retry.
    file_extension = name.split('.')[-1].lower()
    if file_extension == 'mp3':
        if os.path.exists(course_audio_filename):
            logging.info(f"File {course_audio_filename} already exists, skipping download.")
            return
        target_folder = course_id_folder
    else:
        target_folder = os.path.join(course_id_folder, file_extension)

    filename = os.path.join(target_folder, download_filename)
    # Checked only before the first attempt: a partial file left behind by a
    # failed attempt must not make the retry look like a finished download.
    if os.path.exists(filename):
        logging.info(f"File {filename} already exists, skipping download.")
        return

    # Argument list instead of a shell string, so URLs containing '&' or '?'
    # and file names containing spaces are passed through unchanged.
    command = ['aria2c', '-o', filename, '-x', '16', '-s', '16', url]
    for attempt in range(1, max_retries + 1):
        try:
            subprocess.run(command, check=True)
        except FileNotFoundError:
            # Retrying cannot help when the executable itself is missing.
            logging.error("aria2c executable not found; cannot download attachments.")
            return
        except subprocess.CalledProcessError as e:
            logging.error(f"Failed to download {attachment['name']}: {e}")
            if attempt == max_retries:
                logging.error(f"Failed to download {attachment['name']} after {max_retries} attempts.")
            else:
                logging.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
        else:
            logging.info(f"Download Command: {' '.join(command)}")
            return


def worker(queue, course_id_folder, course_audio_filename, max_retries):
    """Thread target: download attachments from ``queue`` until it is empty.

    Args:
        queue: ``queue.Queue`` of attachment dicts.
        course_id_folder: forwarded to ``download_attachment``.
        course_audio_filename: forwarded to ``download_attachment``.
        max_retries: forwarded to ``download_attachment``.
    """
    while True:
        # get_nowait() avoids the race of empty() followed by a blocking
        # get(): another thread may take the last item in between.
        try:
            attachment = queue.get_nowait()
        except Empty:
            return
        try:
            download_attachment(attachment, course_id_folder, course_audio_filename, max_retries)
        except Exception:
            # One broken attachment must not kill the thread.
            logging.exception("Unexpected error while downloading %r", attachment)
        finally:
            # Always acknowledge the item so Queue.join() cannot block forever.
            queue.task_done()


def convert_mp4(mp4_file):
    """Extract the audio track of ``mp4_file`` into a wav file next to it.

    Args:
        mp4_file: path of the source video.

    Returns:
        The path of the wav file, or ``None`` when ffmpeg fails or is not
        installed.
    """
    try:
        mp4_dir = os.path.dirname(mp4_file)
        mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0]
        wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav")
        command = [
            'ffmpeg',
            '-y',
            '-i', mp4_file,
            '-vn',  # drop the video stream
            '-acodec', 'pcm_s16le',  # PCM signed 16-bit little-endian
            '-ar', '44100',  # sample rate 44100 Hz
            '-ac', '2',  # two channels (stereo)
            wav_file
        ]
        subprocess.run(command, check=True)
        logging.info(f"成功将 {mp4_file} 转换为 {wav_file}")
        return wav_file
    except subprocess.CalledProcessError as e:
        logging.error(f"转换失败: {e}")
        return None
    except FileNotFoundError:
        logging.error("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
        return None


def _latest_course_id(cursor):
    """Return the highest course id, or the default when the table is empty."""
    # cursor.execute() returns the cursor itself (always truthy), so the
    # fetched row has to be inspected to detect an empty table.
    row = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1').fetchone()
    if row is None:
        logging.info("No courses found in the database.")
        # NOTE(review): the source was whitespace-collapsed; this fallback is
        # assumed to belong to the empty-table branch only - confirm.
        return DEFAULT_MAX_COURSE_ID
    logging.info(f"The maximum course ID is {row[0]}")
    return row[0]


def _load_course_contents(course_id):
    """Return the contents JSON of a course, using the local cache if present."""
    json_filename = os.path.join('json', f'{course_id}.json')
    if os.path.exists(json_filename):
        logging.info(f"Course {course_id} JSON file already exists, using local file.")
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    response = requests.get(f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
                            headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail before caching so an error body is never stored as course data.
    response.raise_for_status()
    contents_data = response.json()
    with open(json_filename, 'w', encoding='utf-8') as save_json_file:
        json.dump(contents_data, save_json_file, ensure_ascii=False, indent=4)
    return contents_data


def _store_contents(cursor, course_id, contents_data):
    """Insert the content items of a course; existing ids are left untouched."""
    for item in contents_data['data']:
        attachment = item['attachment']
        cursor.execute(_INSERT_CONTENT_SQL,
                       (item['id'], course_id, item['content'], item['category'], item['order'],
                        attachment['url'] if attachment else None,
                        attachment['mime_type'] if attachment else None))


def _download_attachments(contents_data, course_id_folder, course_audio_filename):
    """Download every attachment of a course using the worker threads."""
    attachment_queue = Queue()
    for item in contents_data['data']:
        if item['attachment']:
            attachment_queue.put(item['attachment'])

    threads = [
        Thread(target=worker,
               args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
        for _ in range(max_download_threads)
    ]
    for t in threads:
        t.start()
    # Workers return once the queue is drained, so joining them is enough.
    for t in threads:
        t.join()


def _merge_course_audio(contents_data, course_id_folder, course_audio_filename):
    """Concatenate the audio parts of a course into ``course_audio_filename``."""
    audio_files = [item for item in contents_data['data']
                   if item['category'] == 'audio'
                   and item['attachment'] and item['attachment']['name']]
    if not audio_files or os.path.exists(course_audio_filename):
        return
    audio_files.sort(key=lambda x: x['order'])

    combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
    text_file = os.path.join(course_id_folder, 'input_files.txt')
    # Written as UTF-8 so non-ASCII file names do not depend on the locale.
    with open(text_file, 'w', encoding='utf-8') as f:
        for audio_file in audio_files:
            # Inside a single-quoted entry a quote is written as '\''.
            escaped_name = audio_file['attachment']['name'].replace("'", "'\\''")
            f.write(f"file '{escaped_name}'\n")

    # '-y' overwrites a leftover combined file from an interrupted run
    # instead of waiting for confirmation.
    ffmpeg_command = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', text_file,
                      '-c', 'copy', combined_audio_filename]
    try:
        subprocess.run(ffmpeg_command, check=True)
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # Keep the source parts so the merge can be retried on the next run.
        logging.error(f"Failed to merge audio into {course_audio_filename}: {e}")
        return
    finally:
        os.remove(text_file)

    shutil.move(combined_audio_filename, course_audio_filename)
    # NOTE(review): the part files are removed only after a successful merge;
    # the collapsed source left the original nesting ambiguous - confirm.
    for item in audio_files:
        audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
        try:
            os.remove(audio_file_path)
        except OSError:
            logging.error('delete file fail')


def _sort_attachments_by_extension(contents_data, course_id_folder):
    """Move non-mp3 files lying in the course folder into extension folders."""
    for item in contents_data['data']:
        attachment = item['attachment']
        # Unnamed attachments have no file of their own in the course folder.
        if not attachment or not attachment['name']:
            continue
        name = attachment['name']
        filename = os.path.join(course_id_folder, name)
        # isfile() rather than exists(): a name equal to an extension folder
        # (e.g. "mp4") must not be moved into itself.
        if not os.path.isfile(filename):
            continue
        folder_name = name.split('.')[-1].lower()
        if folder_name == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, folder_name)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _write_course_text(contents_data, course_id, course_id_folder):
    """Save the content of every item with category 'text' to a TXT file."""
    text_contents = [item['content'] for item in contents_data['data'] if item['category'] == 'text']
    if not text_contents:
        return
    with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
        for content in text_contents:
            txt_file.write(content + '\n')


def _process_course_video(course_id_folder):
    """Convert the course mp4 to wav and process it, unless an .md file exists."""
    mp4_folder = os.path.join(course_id_folder, 'mp4')
    mp4_file = None
    exist_md_file = False
    # os.walk() yields nothing when the folder does not exist.
    for root, _dirs, files in os.walk(mp4_folder):
        for file in files:
            lowered = file.lower()
            # An existing .md file marks the course as already processed.
            if lowered.endswith('.md'):
                exist_md_file = True
            # The last mp4 encountered during the walk is the one used.
            if lowered.endswith('.mp4'):
                mp4_file = os.path.join(root, file)

    if exist_md_file or mp4_file is None:
        return
    wav_file = convert_mp4(mp4_file)
    if wav_file is None:
        return
    try:
        process_audio_file(wav_file)
    except Exception:
        logging.exception('process_audio_file fail')


def get_course():
    """Synchronise and download the most recent courses.

    Courses with an id of at least (newest id - 5) are processed: their
    contents are stored in the database, attachments are downloaded, audio
    parts are merged, text contents are written out and the course video is
    processed.
    """
    course_contents = {}
    conn = sqlite3.connect('courses.db')
    try:
        cursor = conn.cursor()
        start_course_id = _latest_course_id(cursor) - RECENT_COURSE_WINDOW
        cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
        course_ids_data = cursor.fetchall()
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        for course_id in course_ids:
            logging.info(f"Processing course ID: {course_id}")
            contents_data = _load_course_contents(course_id)
            course_contents[course_id] = contents_data
            _store_contents(cursor, course_id, contents_data)
            conn.commit()
        cursor.close()
    finally:
        # Close the connection even when a request or an insert fails.
        conn.close()

    for course_id in course_ids:
        contents_data = course_contents[course_id]
        course_id_folder = os.path.join('course', str(course_id))
        os.makedirs(course_id_folder, exist_ok=True)
        course_audio_filename = os.path.join(course_id_folder, f'{course_ids_dict[course_id]}.mp3')

        _download_attachments(contents_data, course_id_folder, course_audio_filename)
        _merge_course_audio(contents_data, course_id_folder, course_audio_filename)
        _sort_attachments_by_extension(contents_data, course_id_folder)
        _write_course_text(contents_data, course_id, course_id_folder)
        _process_course_video(course_id_folder)


if __name__ == '__main__':
    get_course()