优化代码
This commit is contained in:
75
course.py
75
course.py
@@ -11,7 +11,17 @@ from threading import Thread
|
||||
import requests
|
||||
|
||||
from headers import headers
|
||||
import logging
|
||||
from video_voice_process import process_audio_file
|
||||
from logging.handlers import RotatingFileHandler
|
||||
|
||||
# 配置日志
|
||||
logging.basicConfig(level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||
handlers=[
|
||||
logging.StreamHandler(), # 控制台日志
|
||||
RotatingFileHandler('app.log', maxBytes=1024*1024*5, backupCount=3) # 日志文件
|
||||
])
|
||||
|
||||
# 读取配置文件
|
||||
config = configparser.ConfigParser()
|
||||
@@ -26,13 +36,9 @@ headers['authorization'] = f'Bearer {authorization_token}'
|
||||
|
||||
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
|
||||
if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"):
|
||||
print("字符串为空")
|
||||
# 找到最后一个斜杠的位置
|
||||
logging.info("字符串为空")
|
||||
last_slash_index = attachment['url'].rfind('/')
|
||||
|
||||
# 截取最后一个斜杠之后的所有字符
|
||||
download_filename = attachment['url'][last_slash_index + 1:]
|
||||
print(attachment['url'])
|
||||
else:
|
||||
download_filename = attachment['name']
|
||||
|
||||
@@ -40,35 +46,33 @@ def download_attachment(attachment, course_id_folder, course_audio_filename, max
|
||||
while attempt < max_retries:
|
||||
try:
|
||||
url = attachment['url']
|
||||
print(download_filename)
|
||||
print(attachment['name'])
|
||||
|
||||
file_extension = attachment['name'].split('.')[-1].lower()
|
||||
if file_extension != 'mp3':
|
||||
course_id_folder = os.path.join(course_id_folder, file_extension)
|
||||
else:
|
||||
if os.path.exists(course_audio_filename):
|
||||
print(f"File {course_audio_filename} already exists, skipping download.")
|
||||
logging.info(f"File {course_audio_filename} already exists, skipping download.")
|
||||
return
|
||||
|
||||
filename = os.path.join(course_id_folder, download_filename)
|
||||
|
||||
if os.path.exists(filename):
|
||||
print(f"File {filename} already exists, skipping download.")
|
||||
logging.info(f"File {filename} already exists, skipping download.")
|
||||
return
|
||||
|
||||
command = f"aria2c -o {filename} -x 16 -s 16 {url}"
|
||||
print(command)
|
||||
subprocess.run(command, shell=True, check=True)
|
||||
logging.info(f"Download Command: {command}")
|
||||
|
||||
return
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Failed to download {attachment['name']}: {e}")
|
||||
logging.error(f"Failed to download {attachment['name']}: {e}")
|
||||
attempt += 1
|
||||
if attempt == max_retries:
|
||||
print(f"Failed to download {attachment['name']} after {max_retries} attempts.")
|
||||
logging.error(f"Failed to download {attachment['name']} after {max_retries} attempts.")
|
||||
else:
|
||||
print(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
|
||||
logging.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
|
||||
|
||||
|
||||
def worker(queue, course_id_folder, course_audio_filename, max_retries):
|
||||
@@ -80,14 +84,10 @@ def worker(queue, course_id_folder, course_audio_filename, max_retries):
|
||||
|
||||
def convert_mp4(mp4_file):
|
||||
try:
|
||||
# 获取 MP4 文件所在的目录
|
||||
mp4_dir = os.path.dirname(mp4_file)
|
||||
# 获取 MP4 文件的文件名(不包含扩展名)
|
||||
mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0]
|
||||
# 生成对应的 WAV 文件路径
|
||||
wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav")
|
||||
|
||||
# 构建 FFmpeg 命令
|
||||
command = [
|
||||
'ffmpeg',
|
||||
'-y',
|
||||
@@ -99,56 +99,46 @@ def convert_mp4(mp4_file):
|
||||
wav_file
|
||||
]
|
||||
|
||||
# 执行 FFmpeg 命令
|
||||
subprocess.run(command, check=True)
|
||||
print(f"成功将 {mp4_file} 转换为 {wav_file}")
|
||||
logging.info(f"成功将 {mp4_file} 转换为 {wav_file}")
|
||||
return wav_file
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"转换失败: {e}")
|
||||
logging.error(f"转换失败: {e}")
|
||||
return None
|
||||
except FileNotFoundError:
|
||||
print("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
|
||||
logging.error("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
|
||||
return None
|
||||
|
||||
|
||||
def get_course():
|
||||
# 连接到SQLite数据库
|
||||
conn = sqlite3.connect('courses.db')
|
||||
cursor = conn.cursor()
|
||||
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID
|
||||
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1')
|
||||
if max_course_id:
|
||||
max_course_id = max_course_id.fetchone()[0]
|
||||
print(f"The maximum course ID is {max_course_id}")
|
||||
logging.info(f"The maximum course ID is {max_course_id}")
|
||||
else:
|
||||
print("No courses found in the database.")
|
||||
logging.info("No courses found in the database.")
|
||||
max_course_id = 11
|
||||
start_course_id = max_course_id - 5
|
||||
|
||||
# 查询courses表中的所有课程ID
|
||||
cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
|
||||
# cursor.execute('SELECT id, title FROM courses where id >= ')
|
||||
course_ids_data = cursor.fetchall()
|
||||
print(course_ids_data)
|
||||
course_ids = [row[0] for row in course_ids_data]
|
||||
course_ids_dict = dict(course_ids_data)
|
||||
print(course_ids_dict)
|
||||
print(course_ids)
|
||||
|
||||
# 创建json文件夹
|
||||
if not os.path.exists('json'):
|
||||
os.makedirs('json')
|
||||
|
||||
# 创建course文件夹
|
||||
if not os.path.exists('course'):
|
||||
os.makedirs('course')
|
||||
|
||||
# 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中
|
||||
for course_id in course_ids:
|
||||
# course_id = course_id_tuple[0]
|
||||
print(f"Processing course ID: {course_id}")
|
||||
logging.info(f"Processing course ID: {course_id}")
|
||||
|
||||
json_filename = os.path.join('json', f'{course_id}.json')
|
||||
if os.path.exists(json_filename):
|
||||
print(f"Course {course_id} JSON file already exists, using local file.")
|
||||
logging.info(f"Course {course_id} JSON file already exists, using local file.")
|
||||
with open(json_filename, 'r', encoding='utf-8') as json_file:
|
||||
contents_data = json.load(json_file)
|
||||
else:
|
||||
@@ -171,7 +161,6 @@ def get_course():
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
# 现在所有的课程信息都已经保存到数据库中,开始下载附件和进行后续操作
|
||||
for course_id in course_ids:
|
||||
course_id_folder = os.path.join('course', str(course_id))
|
||||
|
||||
@@ -185,27 +174,23 @@ def get_course():
|
||||
|
||||
attachment_queue = Queue()
|
||||
|
||||
# 下载所有附件
|
||||
for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]:
|
||||
attachment_queue.put(attachment)
|
||||
|
||||
# 创建并启动多个下载线程
|
||||
threads = []
|
||||
for _ in range(max_download_threads):
|
||||
t = Thread(target=worker, args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
|
||||
t = Thread(target=worker,
|
||||
args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
|
||||
t.start()
|
||||
threads.append(t)
|
||||
|
||||
# 等待所有下载任务完成
|
||||
attachment_queue.join()
|
||||
|
||||
for t in threads:
|
||||
t.join()
|
||||
|
||||
# 检查是否存在音频文件
|
||||
audio_files = [item for item in contents_data['data'] if item['category'] == 'audio']
|
||||
if audio_files:
|
||||
# 合并所有音频文件
|
||||
audio_files.sort(key=lambda x: x['order'])
|
||||
|
||||
combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
|
||||
@@ -221,15 +206,13 @@ def get_course():
|
||||
shutil.move(combined_audio_filename, course_audio_filename)
|
||||
os.remove(text_file)
|
||||
|
||||
# 删除下载的临时音频文件
|
||||
for item in audio_files:
|
||||
audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
|
||||
try:
|
||||
os.remove(audio_file_path)
|
||||
except:
|
||||
print('delete file fail')
|
||||
logging.error('delete file fail')
|
||||
|
||||
# 整理文件
|
||||
for item in contents_data['data']:
|
||||
attachment = item['attachment']
|
||||
if attachment:
|
||||
|
||||
BIN
courses.db
BIN
courses.db
Binary file not shown.
@@ -11,8 +11,8 @@ from gradio_client import Client, handle_file
|
||||
from pydub import AudioSegment
|
||||
from pydub.silence import split_on_silence
|
||||
|
||||
use_remote_api = False
|
||||
process_workers = 5 if use_remote_api else 2
|
||||
use_remote_api = True
|
||||
process_workers = 5 if use_remote_api else 1
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
config.read('config.ini')
|
||||
@@ -101,7 +101,7 @@ def process_audio_file(audio_file_path):
|
||||
def send_request(chunk, index, file_name_without_extension):
|
||||
audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav")
|
||||
chunk.export(audio_part_path, format="wav")
|
||||
logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}')
|
||||
# logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}')
|
||||
try:
|
||||
if use_remote_api:
|
||||
multipart_form_data = {
|
||||
@@ -140,13 +140,18 @@ def send_request(chunk, index, file_name_without_extension):
|
||||
|
||||
|
||||
def main():
|
||||
all_files = os.listdir('media')
|
||||
audio_files = [file for file in all_files if file.endswith('.wav')]
|
||||
# all_files = os.listdir('media')
|
||||
# audio_files = [file for file in all_files if file.endswith('.wav')]
|
||||
audio_files = []
|
||||
for root, dirs, files in os.walk('media'):
|
||||
for file in files:
|
||||
if file.endswith('.wav'):
|
||||
audio_files.append(os.path.join(root, file))
|
||||
print(audio_files)
|
||||
|
||||
with ThreadPoolExecutor(max_workers=process_workers) as executor:
|
||||
for audio_file in audio_files:
|
||||
audio_file_path = os.path.join('media', audio_file)
|
||||
audio_file_path = os.path.join(audio_file)
|
||||
executor.submit(process_audio_file, audio_file_path)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user