优化代码
This commit is contained in:
75
course.py
75
course.py
@@ -11,7 +11,17 @@ from threading import Thread
|
|||||||
import requests
|
import requests
|
||||||
|
|
||||||
from headers import headers
|
from headers import headers
|
||||||
|
import logging
|
||||||
from video_voice_process import process_audio_file
|
from video_voice_process import process_audio_file
|
||||||
|
from logging.handlers import RotatingFileHandler
|
||||||
|
|
||||||
|
# 配置日志
|
||||||
|
logging.basicConfig(level=logging.INFO,
|
||||||
|
format='%(asctime)s - %(levelname)s - %(message)s',
|
||||||
|
handlers=[
|
||||||
|
logging.StreamHandler(), # 控制台日志
|
||||||
|
RotatingFileHandler('app.log', maxBytes=1024*1024*5, backupCount=3) # 日志文件
|
||||||
|
])
|
||||||
|
|
||||||
# 读取配置文件
|
# 读取配置文件
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
@@ -26,13 +36,9 @@ headers['authorization'] = f'Bearer {authorization_token}'
|
|||||||
|
|
||||||
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
|
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
|
||||||
if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"):
|
if attachment['name'] in ["", None] or attachment['name'].endswith(".m3u8"):
|
||||||
print("字符串为空")
|
logging.info("字符串为空")
|
||||||
# 找到最后一个斜杠的位置
|
|
||||||
last_slash_index = attachment['url'].rfind('/')
|
last_slash_index = attachment['url'].rfind('/')
|
||||||
|
|
||||||
# 截取最后一个斜杠之后的所有字符
|
|
||||||
download_filename = attachment['url'][last_slash_index + 1:]
|
download_filename = attachment['url'][last_slash_index + 1:]
|
||||||
print(attachment['url'])
|
|
||||||
else:
|
else:
|
||||||
download_filename = attachment['name']
|
download_filename = attachment['name']
|
||||||
|
|
||||||
@@ -40,35 +46,33 @@ def download_attachment(attachment, course_id_folder, course_audio_filename, max
|
|||||||
while attempt < max_retries:
|
while attempt < max_retries:
|
||||||
try:
|
try:
|
||||||
url = attachment['url']
|
url = attachment['url']
|
||||||
print(download_filename)
|
|
||||||
print(attachment['name'])
|
|
||||||
|
|
||||||
file_extension = attachment['name'].split('.')[-1].lower()
|
file_extension = attachment['name'].split('.')[-1].lower()
|
||||||
if file_extension != 'mp3':
|
if file_extension != 'mp3':
|
||||||
course_id_folder = os.path.join(course_id_folder, file_extension)
|
course_id_folder = os.path.join(course_id_folder, file_extension)
|
||||||
else:
|
else:
|
||||||
if os.path.exists(course_audio_filename):
|
if os.path.exists(course_audio_filename):
|
||||||
print(f"File {course_audio_filename} already exists, skipping download.")
|
logging.info(f"File {course_audio_filename} already exists, skipping download.")
|
||||||
return
|
return
|
||||||
|
|
||||||
filename = os.path.join(course_id_folder, download_filename)
|
filename = os.path.join(course_id_folder, download_filename)
|
||||||
|
|
||||||
if os.path.exists(filename):
|
if os.path.exists(filename):
|
||||||
print(f"File {filename} already exists, skipping download.")
|
logging.info(f"File {filename} already exists, skipping download.")
|
||||||
return
|
return
|
||||||
|
|
||||||
command = f"aria2c -o {filename} -x 16 -s 16 {url}"
|
command = f"aria2c -o {filename} -x 16 -s 16 {url}"
|
||||||
print(command)
|
|
||||||
subprocess.run(command, shell=True, check=True)
|
subprocess.run(command, shell=True, check=True)
|
||||||
|
logging.info(f"Download Command: {command}")
|
||||||
|
|
||||||
return
|
return
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f"Failed to download {attachment['name']}: {e}")
|
logging.error(f"Failed to download {attachment['name']}: {e}")
|
||||||
attempt += 1
|
attempt += 1
|
||||||
if attempt == max_retries:
|
if attempt == max_retries:
|
||||||
print(f"Failed to download {attachment['name']} after {max_retries} attempts.")
|
logging.error(f"Failed to download {attachment['name']} after {max_retries} attempts.")
|
||||||
else:
|
else:
|
||||||
print(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
|
logging.warning(f"Retrying {attachment['name']}... ({attempt}/{max_retries})")
|
||||||
|
|
||||||
|
|
||||||
def worker(queue, course_id_folder, course_audio_filename, max_retries):
|
def worker(queue, course_id_folder, course_audio_filename, max_retries):
|
||||||
@@ -80,14 +84,10 @@ def worker(queue, course_id_folder, course_audio_filename, max_retries):
|
|||||||
|
|
||||||
def convert_mp4(mp4_file):
|
def convert_mp4(mp4_file):
|
||||||
try:
|
try:
|
||||||
# 获取 MP4 文件所在的目录
|
|
||||||
mp4_dir = os.path.dirname(mp4_file)
|
mp4_dir = os.path.dirname(mp4_file)
|
||||||
# 获取 MP4 文件的文件名(不包含扩展名)
|
|
||||||
mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0]
|
mp4_filename = os.path.splitext(os.path.basename(mp4_file))[0]
|
||||||
# 生成对应的 WAV 文件路径
|
|
||||||
wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav")
|
wav_file = os.path.join(mp4_dir, f"{mp4_filename}.wav")
|
||||||
|
|
||||||
# 构建 FFmpeg 命令
|
|
||||||
command = [
|
command = [
|
||||||
'ffmpeg',
|
'ffmpeg',
|
||||||
'-y',
|
'-y',
|
||||||
@@ -99,56 +99,46 @@ def convert_mp4(mp4_file):
|
|||||||
wav_file
|
wav_file
|
||||||
]
|
]
|
||||||
|
|
||||||
# 执行 FFmpeg 命令
|
|
||||||
subprocess.run(command, check=True)
|
subprocess.run(command, check=True)
|
||||||
print(f"成功将 {mp4_file} 转换为 {wav_file}")
|
logging.info(f"成功将 {mp4_file} 转换为 {wav_file}")
|
||||||
return wav_file
|
return wav_file
|
||||||
except subprocess.CalledProcessError as e:
|
except subprocess.CalledProcessError as e:
|
||||||
print(f"转换失败: {e}")
|
logging.error(f"转换失败: {e}")
|
||||||
return None
|
return None
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
print("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
|
logging.error("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def get_course():
|
def get_course():
|
||||||
# 连接到SQLite数据库
|
|
||||||
conn = sqlite3.connect('courses.db')
|
conn = sqlite3.connect('courses.db')
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID
|
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1')
|
||||||
if max_course_id:
|
if max_course_id:
|
||||||
max_course_id = max_course_id.fetchone()[0]
|
max_course_id = max_course_id.fetchone()[0]
|
||||||
print(f"The maximum course ID is {max_course_id}")
|
logging.info(f"The maximum course ID is {max_course_id}")
|
||||||
else:
|
else:
|
||||||
print("No courses found in the database.")
|
logging.info("No courses found in the database.")
|
||||||
max_course_id = 11
|
max_course_id = 11
|
||||||
start_course_id = max_course_id - 5
|
start_course_id = max_course_id - 5
|
||||||
|
|
||||||
# 查询courses表中的所有课程ID
|
|
||||||
cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
|
cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
|
||||||
# cursor.execute('SELECT id, title FROM courses where id >= ')
|
|
||||||
course_ids_data = cursor.fetchall()
|
course_ids_data = cursor.fetchall()
|
||||||
print(course_ids_data)
|
|
||||||
course_ids = [row[0] for row in course_ids_data]
|
course_ids = [row[0] for row in course_ids_data]
|
||||||
course_ids_dict = dict(course_ids_data)
|
course_ids_dict = dict(course_ids_data)
|
||||||
print(course_ids_dict)
|
|
||||||
print(course_ids)
|
|
||||||
|
|
||||||
# 创建json文件夹
|
|
||||||
if not os.path.exists('json'):
|
if not os.path.exists('json'):
|
||||||
os.makedirs('json')
|
os.makedirs('json')
|
||||||
|
|
||||||
# 创建course文件夹
|
|
||||||
if not os.path.exists('course'):
|
if not os.path.exists('course'):
|
||||||
os.makedirs('course')
|
os.makedirs('course')
|
||||||
|
|
||||||
# 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中
|
|
||||||
for course_id in course_ids:
|
for course_id in course_ids:
|
||||||
# course_id = course_id_tuple[0]
|
logging.info(f"Processing course ID: {course_id}")
|
||||||
print(f"Processing course ID: {course_id}")
|
|
||||||
|
|
||||||
json_filename = os.path.join('json', f'{course_id}.json')
|
json_filename = os.path.join('json', f'{course_id}.json')
|
||||||
if os.path.exists(json_filename):
|
if os.path.exists(json_filename):
|
||||||
print(f"Course {course_id} JSON file already exists, using local file.")
|
logging.info(f"Course {course_id} JSON file already exists, using local file.")
|
||||||
with open(json_filename, 'r', encoding='utf-8') as json_file:
|
with open(json_filename, 'r', encoding='utf-8') as json_file:
|
||||||
contents_data = json.load(json_file)
|
contents_data = json.load(json_file)
|
||||||
else:
|
else:
|
||||||
@@ -171,7 +161,6 @@ def get_course():
|
|||||||
cursor.close()
|
cursor.close()
|
||||||
conn.close()
|
conn.close()
|
||||||
|
|
||||||
# 现在所有的课程信息都已经保存到数据库中,开始下载附件和进行后续操作
|
|
||||||
for course_id in course_ids:
|
for course_id in course_ids:
|
||||||
course_id_folder = os.path.join('course', str(course_id))
|
course_id_folder = os.path.join('course', str(course_id))
|
||||||
|
|
||||||
@@ -185,27 +174,23 @@ def get_course():
|
|||||||
|
|
||||||
attachment_queue = Queue()
|
attachment_queue = Queue()
|
||||||
|
|
||||||
# 下载所有附件
|
|
||||||
for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]:
|
for attachment in [item['attachment'] for item in contents_data['data'] if item['attachment']]:
|
||||||
attachment_queue.put(attachment)
|
attachment_queue.put(attachment)
|
||||||
|
|
||||||
# 创建并启动多个下载线程
|
|
||||||
threads = []
|
threads = []
|
||||||
for _ in range(max_download_threads):
|
for _ in range(max_download_threads):
|
||||||
t = Thread(target=worker, args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
|
t = Thread(target=worker,
|
||||||
|
args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
|
||||||
t.start()
|
t.start()
|
||||||
threads.append(t)
|
threads.append(t)
|
||||||
|
|
||||||
# 等待所有下载任务完成
|
|
||||||
attachment_queue.join()
|
attachment_queue.join()
|
||||||
|
|
||||||
for t in threads:
|
for t in threads:
|
||||||
t.join()
|
t.join()
|
||||||
|
|
||||||
# 检查是否存在音频文件
|
|
||||||
audio_files = [item for item in contents_data['data'] if item['category'] == 'audio']
|
audio_files = [item for item in contents_data['data'] if item['category'] == 'audio']
|
||||||
if audio_files:
|
if audio_files:
|
||||||
# 合并所有音频文件
|
|
||||||
audio_files.sort(key=lambda x: x['order'])
|
audio_files.sort(key=lambda x: x['order'])
|
||||||
|
|
||||||
combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
|
combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
|
||||||
@@ -221,15 +206,13 @@ def get_course():
|
|||||||
shutil.move(combined_audio_filename, course_audio_filename)
|
shutil.move(combined_audio_filename, course_audio_filename)
|
||||||
os.remove(text_file)
|
os.remove(text_file)
|
||||||
|
|
||||||
# 删除下载的临时音频文件
|
|
||||||
for item in audio_files:
|
for item in audio_files:
|
||||||
audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
|
audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
|
||||||
try:
|
try:
|
||||||
os.remove(audio_file_path)
|
os.remove(audio_file_path)
|
||||||
except:
|
except:
|
||||||
print('delete file fail')
|
logging.error('delete file fail')
|
||||||
|
|
||||||
# 整理文件
|
|
||||||
for item in contents_data['data']:
|
for item in contents_data['data']:
|
||||||
attachment = item['attachment']
|
attachment = item['attachment']
|
||||||
if attachment:
|
if attachment:
|
||||||
|
|||||||
BIN
courses.db
BIN
courses.db
Binary file not shown.
@@ -11,8 +11,8 @@ from gradio_client import Client, handle_file
|
|||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
from pydub.silence import split_on_silence
|
from pydub.silence import split_on_silence
|
||||||
|
|
||||||
use_remote_api = False
|
use_remote_api = True
|
||||||
process_workers = 5 if use_remote_api else 2
|
process_workers = 5 if use_remote_api else 1
|
||||||
|
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
config.read('config.ini')
|
config.read('config.ini')
|
||||||
@@ -101,7 +101,7 @@ def process_audio_file(audio_file_path):
|
|||||||
def send_request(chunk, index, file_name_without_extension):
|
def send_request(chunk, index, file_name_without_extension):
|
||||||
audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav")
|
audio_part_path = os.path.join('media', f"{file_name_without_extension}_chunk_{index}.wav")
|
||||||
chunk.export(audio_part_path, format="wav")
|
chunk.export(audio_part_path, format="wav")
|
||||||
logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}')
|
# logging.info(f'Exported chunk file {audio_part_path} for {file_name_without_extension}')
|
||||||
try:
|
try:
|
||||||
if use_remote_api:
|
if use_remote_api:
|
||||||
multipart_form_data = {
|
multipart_form_data = {
|
||||||
@@ -140,13 +140,18 @@ def send_request(chunk, index, file_name_without_extension):
|
|||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
all_files = os.listdir('media')
|
# all_files = os.listdir('media')
|
||||||
audio_files = [file for file in all_files if file.endswith('.wav')]
|
# audio_files = [file for file in all_files if file.endswith('.wav')]
|
||||||
|
audio_files = []
|
||||||
|
for root, dirs, files in os.walk('media'):
|
||||||
|
for file in files:
|
||||||
|
if file.endswith('.wav'):
|
||||||
|
audio_files.append(os.path.join(root, file))
|
||||||
print(audio_files)
|
print(audio_files)
|
||||||
|
|
||||||
with ThreadPoolExecutor(max_workers=process_workers) as executor:
|
with ThreadPoolExecutor(max_workers=process_workers) as executor:
|
||||||
for audio_file in audio_files:
|
for audio_file in audio_files:
|
||||||
audio_file_path = os.path.join('media', audio_file)
|
audio_file_path = os.path.join(audio_file)
|
||||||
executor.submit(process_audio_file, audio_file_path)
|
executor.submit(process_audio_file, audio_file_path)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user