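优化代码 — replace print() calls with logging (console + rotating app.log), remove redundant inline comments, and scan media/ recursively for .wav files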

This commit is contained in:
YuanHui
2025-03-07 15:10:44 +08:00
parent 5b5570ccc8
commit 05b8744a9d
3 changed files with 40 additions and 52 deletions

View File

@@ -11,7 +11,17 @@ from threading import Thread
import requests import requests
from headers import headers from headers import headers
import logging
from video_voice_process import process_audio_file from video_voice_process import process_audio_file
from logging.handlers import RotatingFileHandler
# 配置日志
logging.basicConfig(level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s',
handlers=[
logging.StreamHandler(), # 控制台日志
RotatingFileHandler('app.log', maxBytes=1024*1024*5, backupCount=3) # 日志文件
])
# 读取配置文件 # 读取配置文件
config = configparser.ConfigParser() config = configparser.ConfigParser()
@@ -26,13 +36,9 @@ headers['authorization'] = f'Bearer {authorization_token}'
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries): def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"): if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"):
print("字符串为空") logging.info("字符串为空")
# 找到最后一个斜杠的位置
last_slash_index = attachment['url'].rfind('/') last_slash_index = attachment['url'].rfind('/')
# 截取最后一个斜杠之后的所有字符
download_filename = attachment['url'][last_slash_index + 1:] download_filename = attachment['url'][last_slash_index + 1:]
print(attachment['url'])
else: else:
download_filename = attachment['name'] download_filename = attachment['name']
@@ -40,35 +46,33 @@ def download_attachment(attachment, course_id_folder, course_audio_filename, max
while attempt < max_retries: while attempt < max_retries:
try: try:
url = attachment['url'] url = attachment['url']
print(download_filename)
print(attachment['name'])
file_extension = attachment['name'].split('.')[-1].lower() file_extension = attachment['name'].split('.')[-1].lower()
if file_extension != 'mp3': if file_extension != 'mp3':
course_id_folder = os.path.join(course_id_folder, file_extension) course_id_folder = os.path.join(course_id_folder, file_extension)
else: else:
if os.path.exists(course_audio_filename): if os.path.exists(course_audio_filename):
print(f"File {course_audio_filename} already exists, skipping download.") logging.info(f"File {course_audio_filename} already exists, skipping download.")
return return
filename = os.path.join(course_id_folder, download_filename) filename = os.path.join(course_id_folder, download_filename)
if os.path.exists(filename): if os.path.exists(filename):
print(f"File {filename} already exists, skipping download.") logging.info(f"File {filename} already exists, skipping download.")
return return
command = f"aria2c -o {filename} -x 16 -s 16 {url}" command = f"aria2c -o {filename} -x 16 -s 16 {url}"
print(command)
subprocess.run(command, shell=True, check=True) subprocess.run(command, shell=True, check=True)
logging.info(f"Download Command: {command}")
return return
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"Failed to download {attachment['name']}: {e}") logging.error(f"Failed to download {attachment['name']}: {e}")
attempt += 1 attempt += 1
if attempt == max_retries: if attempt == max_retries:
print(f"Failed to download {attachment['name']} after {max_retries} attempts.") logging.error(f"Failed to download {attachment['name']} after {max_retries} attempts.")
else: else:
print(f"Retrying {attachment['name']}... ({attempt}/{max_retries})") logging.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
def worker(queue, course_id_folder, course_audio_filename, max_retries): def worker(queue, course_id_folder, course_audio_filename, max_retries):
@@ -80,14 +84,10 @@ def worker(queue, course_id_folder, course_audio_filename, max_retries):
def convert_mp4(mp4_file): def convert_mp4(mp4_file):
try: try:
# 获取 MP4 文件所在的目录
mp4_dir = os.path.dirname(mp4_file) mp4_dir = os.path.dirname(mp4_file)
# 获取 MP4 文件的文件名(不包含扩展名)
mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0] mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0]
# 生成对应的 WAV 文件路径
wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav") wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav")
# 构建 FFmpeg 命令
command = [ command = [
'ffmpeg', 'ffmpeg',
'-y', '-y',
@@ -99,56 +99,46 @@ def convert_mp4(mp4_file):
wav_file wav_file
] ]
# 执行 FFmpeg 命令
subprocess.run(command, check=True) subprocess.run(command, check=True)
print(f"成功将 {mp4_file} 转换为 {wav_file}") logging.info(f"成功将 {mp4_file} 转换为 {wav_file}")
return wav_file return wav_file
except subprocess.CalledProcessError as e: except subprocess.CalledProcessError as e:
print(f"转换失败: {e}") logging.error(f"转换失败: {e}")
return None return None
except FileNotFoundError: except FileNotFoundError:
print("未找到 FFmpeg请确保已安装并配置好 FFmpeg 环境。") logging.error("未找到 FFmpeg请确保已安装并配置好 FFmpeg 环境。")
return None return None
def get_course(): def get_course():
# 连接到SQLite数据库
conn = sqlite3.connect('courses.db') conn = sqlite3.connect('courses.db')
cursor = conn.cursor() cursor = conn.cursor()
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1')
if max_course_id: if max_course_id:
max_course_id = max_course_id.fetchone()[0] max_course_id = max_course_id.fetchone()[0]
print(f"The maximum course ID is {max_course_id}") logging.info(f"The maximum course ID is {max_course_id}")
else: else:
print("No courses found in the database.") logging.info("No courses found in the database.")
max_course_id = 11 max_course_id = 11
start_course_id = max_course_id - 5 start_course_id = max_course_id - 5
# 查询courses表中的所有课程ID
cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,)) cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
# cursor.execute('SELECT id, title FROM courses where id >= ')
course_ids_data = cursor.fetchall() course_ids_data = cursor.fetchall()
print(course_ids_data)
course_ids = [row[0] for row in course_ids_data] course_ids = [row[0] for row in course_ids_data]
course_ids_dict = dict(course_ids_data) course_ids_dict = dict(course_ids_data)
print(course_ids_dict)
print(course_ids)
# 创建json文件夹
if not os.path.exists('json'): if not os.path.exists('json'):
os.makedirs('json') os.makedirs('json')
# 创建course文件夹
if not os.path.exists('course'): if not os.path.exists('course'):
os.makedirs('course') os.makedirs('course')
# 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中
for course_id in course_ids: for course_id in course_ids:
# course_id = course_id_tuple[0] logging.info(f"Processing course ID: {course_id}")
print(f"Processing course ID: {course_id}")
json_filename = os.path.join('json', f'{course_id}.json') json_filename = os.path.join('json', f'{course_id}.json')
if os.path.exists(json_filename): if os.path.exists(json_filename):
print(f"Course {course_id} JSON file already exists, using local file.") logging.info(f"Course {course_id} JSON file already exists, using local file.")
with open(json_filename, 'r', encoding='utf-8') as json_file: with open(json_filename, 'r', encoding='utf-8') as json_file:
contents_data = json.load(json_file) contents_data = json.load(json_file)
else: else:
@@ -171,7 +161,6 @@ def get_course():
cursor.close() cursor.close()
conn.close() conn.close()
# 现在所有的课程信息都已经保存到数据库中,开始下载附件和进行后续操作
for course_id in course_ids: for course_id in course_ids:
course_id_folder = os.path.join('course', str(course_id)) course_id_folder = os.path.join('course', str(course_id))
@@ -185,27 +174,23 @@ def get_course():
attachment_queue = Queue() attachment_queue = Queue()
# 下载所有附件
for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]: for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]:
attachment_queue.put(attachment) attachment_queue.put(attachment)
# 创建并启动多个下载线程
threads = [] threads = []
for _ in range(max_download_threads): for _ in range(max_download_threads):
t = Thread(target=worker, args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts)) t = Thread(target=worker,
args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
t.start() t.start()
threads.append(t) threads.append(t)
# 等待所有下载任务完成
attachment_queue.join() attachment_queue.join()
for t in threads: for t in threads:
t.join() t.join()
# 检查是否存在音频文件
audio_files = [item for item in contents_data['data'] if item['category'] == 'audio'] audio_files = [item for item in contents_data['data'] if item['category'] == 'audio']
if audio_files: if audio_files:
# 合并所有音频文件
audio_files.sort(key=lambda x: x['order']) audio_files.sort(key=lambda x: x['order'])
combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3') combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
@@ -221,15 +206,13 @@ def get_course():
shutil.move(combined_audio_filename, course_audio_filename) shutil.move(combined_audio_filename, course_audio_filename)
os.remove(text_file) os.remove(text_file)
# 删除下载的临时音频文件
for item in audio_files: for item in audio_files:
audio_file_path = os.path.join(course_id_folder, item['attachment']['name']) audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
try: try:
os.remove(audio_file_path) os.remove(audio_file_path)
except: except:
print('delete file fail') logging.error('delete file fail')
# 整理文件
for item in contents_data['data']: for item in contents_data['data']:
attachment = item['attachment'] attachment = item['attachment']
if attachment: if attachment:

Binary file not shown.

View File

@@ -11,8 +11,8 @@ from gradio_client import Client, handle_file
from pydub import AudioSegment from pydub import AudioSegment
from pydub.silence import split_on_silence from pydub.silence import split_on_silence
use_remote_api = False use_remote_api = True
process_workers = 5 if use_remote_api else 2 process_workers = 5 if use_remote_api else 1
config = configparser.ConfigParser() config = configparser.ConfigParser()
config.read('config.ini') config.read('config.ini')
@@ -101,7 +101,7 @@ def process_audio_file(audio_file_path):
def send_request(chunk, index, file_name_without_extension): def send_request(chunk, index, file_name_without_extension):
audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav") audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav")
chunk.export(audio_part_path, format="wav") chunk.export(audio_part_path, format="wav")
logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}') # logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}')
try: try:
if use_remote_api: if use_remote_api:
multipart_form_data = { multipart_form_data = {
@@ -140,13 +140,18 @@ def send_request(chunk, index, file_name_without_extension):
def main(): def main():
all_files = os.listdir('media') # all_files = os.listdir('media')
audio_files = [file for file in all_files if file.endswith('.wav')] # audio_files = [file for file in all_files if file.endswith('.wav')]
audio_files = []
for root, dirs, files in os.walk('media'):
for file in files:
if file.endswith('.wav'):
audio_files.append(os.path.join(root, file))
print(audio_files) print(audio_files)
with ThreadPoolExecutor(max_workers=process_workers) as executor: with ThreadPoolExecutor(max_workers=process_workers) as executor:
for audio_file in audio_files: for audio_file in audio_files:
audio_file_path = os.path.join('media', audio_file) audio_file_path = os.path.join(audio_file)
executor.submit(process_audio_file, audio_file_path) executor.submit(process_audio_file, audio_file_path)