diff --git a/course.py b/course.py index c629077..a6701d7 100755 --- a/course.py +++ b/course.py @@ -11,7 +11,17 @@ from threading import Thread import requests from headers import headers +import logging from video_voice_process import process_audio_file +from logging.handlers import RotatingFileHandler + +# 配置日志 +logging.basicConfig(level=logging.INFO, + format='%(asctime)s - %(levelname)s - %(message)s', + handlers=[ + logging.StreamHandler(), # 控制台日志 + RotatingFileHandler('app.log', maxBytes=1024*1024*5, backupCount=3) # 日志文件 + ]) # 读取配置文件 config = configparser.ConfigParser() @@ -26,13 +36,9 @@ headers['authorization'] = f'Bearer {authorization_token}' def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries): if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"): - print("字符串为空") - # 找到最后一个斜杠的位置 + logging.info("字符串为空") last_slash_index = attachment['url'].rfind('/') - - # 截取最后一个斜杠之后的所有字符 download_filename = attachment['url'][last_slash_index + 1:] - print(attachment['url']) else: download_filename = attachment['name'] @@ -40,35 +46,33 @@ def download_attachment(attachment, course_id_folder, course_audio_filename, max while attempt < max_retries: try: url = attachment['url'] - print(download_filename) - print(attachment['name']) file_extension = attachment['name'].split('.')[-1].lower() if file_extension != 'mp3': course_id_folder = os.path.join(course_id_folder, file_extension) else: if os.path.exists(course_audio_filename): - print(f"File {course_audio_filename} already exists, skipping download.") + logging.info(f"File {course_audio_filename} already exists, skipping download.") return filename = os.path.join(course_id_folder, download_filename) if os.path.exists(filename): - print(f"File {filename} already exists, skipping download.") + logging.info(f"File {filename} already exists, skipping download.") return command = f"aria2c -o {filename} -x 16 -s 16 {url}" - print(command) subprocess.run(command, shell=True, check=True) + logging.info(f"Download Command: {command}") return except subprocess.CalledProcessError as e: - print(f"Failed to download {attachment['name']}: {e}") + logging.error(f"Failed to download {attachment['name']}: {e}") attempt += 1 if attempt == max_retries: - print(f"Failed to download {attachment['name']} after {max_retries} attempts.") + logging.error(f"Failed to download {attachment['name']} after {max_retries} attempts.") else: - print(f"Retrying {attachment['name']}... ({attempt}/{max_retries})") + logging.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})") def worker(queue, course_id_folder, course_audio_filename, max_retries): @@ -80,14 +84,10 @@ def worker(queue, course_id_folder, course_audio_filename, max_retries): def convert_mp4(mp4_file): try: - # 获取 MP4 文件所在的目录 mp4_dir = os.path.dirname(mp4_file) - # 获取 MP4 文件的文件名(不包含扩展名) mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0] - # 生成对应的 WAV 文件路径 wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav") - # 构建 FFmpeg 命令 command = [ 'ffmpeg', '-y', @@ -99,56 +99,46 @@ def convert_mp4(mp4_file): wav_file ] - # 执行 FFmpeg 命令 subprocess.run(command, check=True) - print(f"成功将 {mp4_file} 转换为 {wav_file}") + logging.info(f"成功将 {mp4_file} 转换为 {wav_file}") return wav_file except subprocess.CalledProcessError as e: - print(f"转换失败: {e}") + logging.error(f"转换失败: {e}") return None except FileNotFoundError: - print("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。") + logging.error("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。") return None + def get_course(): - # 连接到SQLite数据库 conn = sqlite3.connect('courses.db') cursor = conn.cursor() - max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID + max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') if max_course_id: max_course_id = max_course_id.fetchone()[0] - print(f"The maximum course ID is {max_course_id}") + logging.info(f"The maximum course ID is {max_course_id}") else: - print("No courses found in the database.") + logging.info("No courses found in the database.") max_course_id = 11 start_course_id = max_course_id - 5 - # 查询courses表中的所有课程ID cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,)) - # cursor.execute('SELECT id, title FROM courses where id >= ') course_ids_data = cursor.fetchall() - print(course_ids_data) course_ids = [row[0] for row in course_ids_data] course_ids_dict = dict(course_ids_data) - print(course_ids_dict) - print(course_ids) - # 创建json文件夹 if not os.path.exists('json'): os.makedirs('json') - # 创建course文件夹 if not os.path.exists('course'): os.makedirs('course') - # 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中 for course_id in course_ids: - # course_id = course_id_tuple[0] - print(f"Processing course ID: {course_id}") + logging.info(f"Processing course ID: {course_id}") json_filename = os.path.join('json', f'{course_id}.json') if os.path.exists(json_filename): - print(f"Course {course_id} JSON file already exists, using local file.") + logging.info(f"Course {course_id} JSON file already exists, using local file.") with open(json_filename, 'r', encoding='utf-8') as json_file: contents_data = json.load(json_file) else: @@ -171,7 +161,6 @@ def get_course(): cursor.close() conn.close() - # 现在所有的课程信息都已经保存到数据库中,开始下载附件和进行后续操作 for course_id in course_ids: course_id_folder = os.path.join('course', str(course_id)) @@ -185,27 +174,23 @@ def get_course(): attachment_queue = Queue() - # 下载所有附件 for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]: attachment_queue.put(attachment) - # 创建并启动多个下载线程 threads = [] for _ in range(max_download_threads): - t = Thread(target=worker, args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts)) + t = Thread(target=worker, + args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts)) t.start() threads.append(t) - # 等待所有下载任务完成 attachment_queue.join() for t in threads: t.join() - # 检查是否存在音频文件 audio_files = [item for item in contents_data['data'] if item['category'] == 'audio'] if audio_files: - # 合并所有音频文件 audio_files.sort(key=lambda x: x['order']) combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3') @@ -221,15 +206,13 @@ def get_course(): shutil.move(combined_audio_filename, course_audio_filename) os.remove(text_file) - # 删除下载的临时音频文件 for item in audio_files: audio_file_path = os.path.join(course_id_folder, item['attachment']['name']) try: os.remove(audio_file_path) except: - print('delete file fail') + logging.error('delete file fail') - # 整理文件 for item in contents_data['data']: attachment = item['attachment'] if attachment: diff --git a/courses.db b/courses.db index f1296b1..5f744cc 100755 Binary files a/courses.db and b/courses.db differ diff --git a/video_voice_process.py b/video_voice_process.py index b3be43a..5116250 100644 --- a/video_voice_process.py +++ b/video_voice_process.py @@ -11,8 +11,8 @@ from gradio_client import Client, handle_file from pydub import AudioSegment from pydub.silence import split_on_silence -use_remote_api = False -process_workers = 5 if use_remote_api else 2 +use_remote_api = True +process_workers = 5 if use_remote_api else 1 config = configparser.ConfigParser() config.read('config.ini') @@ -101,7 +101,7 @@ def process_audio_file(audio_file_path): def send_request(chunk, index, file_name_without_extension): audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav") chunk.export(audio_part_path, format="wav") - logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}') + # logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}') try: if use_remote_api: multipart_form_data = { @@ -140,13 +140,18 @@ def send_request(chunk, index, file_name_without_extension): def main(): - all_files = os.listdir('media') - audio_files = [file for file in all_files if file.endswith('.wav')] + # all_files = os.listdir('media') + # audio_files = [file for file in all_files if file.endswith('.wav')] + audio_files = [] + for root, dirs, files in os.walk('media'): + for file in files: + if file.endswith('.wav'): + audio_files.append(os.path.join(root, file)) print(audio_files) with ThreadPoolExecutor(max_workers=process_workers) as executor: for audio_file in audio_files: - audio_file_path = os.path.join('media', audio_file) + audio_file_path = os.path.join(audio_file) executor.submit(process_audio_file, audio_file_path)