# File-viewer metadata captured with the source: 274 lines, 11 KiB, Python, executable file.
# -*- coding: utf-8 -*-
|
||
import configparser
|
||
import json
|
||
import os
|
||
import shutil
|
||
import sqlite3
|
||
import subprocess
|
||
from queue import Queue
|
||
from threading import Thread
|
||
|
||
import requests
|
||
|
||
from headers import headers
|
||
from video_voice_process import process_audio_file
|
||
|
||
# Load runtime settings from config.ini; every key is read from the DEFAULT section.
config = configparser.ConfigParser()
config.read('config.ini')
default_section = config['DEFAULT']

authorization_token = default_section['authorization_token']
max_download_threads = int(default_section['max_download_threads'])
max_retry_attempts = int(default_section['max_retry_attempts'])

# The shared request headers carry the bearer token taken from the config file.
headers['authorization'] = f'Bearer {authorization_token}'
|
||
|
||
|
||
def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
    """Download a single attachment with aria2c, retrying failed transfers.

    Files whose extension is not ``mp3`` are stored in a sub-folder of
    ``course_id_folder`` named after the extension; MP3 files are stored
    directly in ``course_id_folder``.

    Args:
        attachment: Mapping with at least the keys ``'name'`` and ``'url'``.
            ``'name'`` may be ``None``, empty, or end in ``.m3u8``; in those
            cases the last path segment of the URL is used as the file name.
        course_id_folder: Directory that holds the course's files.
        course_audio_filename: Path of the course's merged MP3. When it
            already exists, MP3 attachments are not downloaded again.
        max_retries: Maximum number of aria2c invocations before giving up.

    Returns:
        None. Failures are reported on stdout; nothing is raised for a
        failed or unavailable aria2c.
    """
    name = attachment['name']
    url = attachment['url']

    if not name or name.endswith(".m3u8"):
        print("字符串为空")
        # Fall back to everything after the last slash of the URL.
        download_filename = url[url.rfind('/') + 1:]
        print(url)
    else:
        download_filename = name

    print(download_filename)
    print(name)

    # A missing name has no extension of its own, so derive it from the
    # file name that will actually be written.
    file_extension = (name or download_filename).split('.')[-1].lower()

    # Resolve the destination once, outside the retry loop, so a retry never
    # nests the extension folder a second time.
    if file_extension != 'mp3':
        target_folder = os.path.join(course_id_folder, file_extension)
    else:
        target_folder = course_id_folder
        if os.path.exists(course_audio_filename):
            print(f"File {course_audio_filename} already exists, skipping download.")
            return

    filename = os.path.join(target_folder, download_filename)
    if os.path.exists(filename):
        print(f"File {filename} already exists, skipping download.")
        return

    # Argument list instead of a shell string: names and URLs containing
    # spaces or shell metacharacters are passed through verbatim.
    command = ['aria2c', '-o', filename, '-x', '16', '-s', '16', url]

    for attempt in range(1, max_retries + 1):
        print(command)
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing aria2c binary, which the shell-based
            # invocation used to surface as a non-zero exit status.
            print(f"Failed to download {name}: {e}")
            if attempt == max_retries:
                print(f"Failed to download {name} after {max_retries} attempts.")
            else:
                print(f"Retrying {name}... ({attempt}/{max_retries})")
        else:
            return
|
||
|
||
|
||
def worker(queue, course_id_folder, course_audio_filename, max_retries):
    """Drain ``queue``, downloading each attachment it yields.

    Intended as a thread target: several workers may share one queue. The
    worker returns as soon as the queue has no more items.

    Args:
        queue: ``queue.Queue`` of attachment mappings.
        course_id_folder: Passed through to ``download_attachment``.
        course_audio_filename: Passed through to ``download_attachment``.
        max_retries: Passed through to ``download_attachment``.
    """
    # Imported locally because the module only imports ``Queue``.
    from queue import Empty

    while True:
        # ``empty()`` followed by a blocking ``get()`` races with the other
        # workers: the last item can be taken in between, leaving this thread
        # blocked forever. A non-blocking get avoids that.
        try:
            attachment = queue.get_nowait()
        except Empty:
            return

        try:
            download_attachment(attachment, course_id_folder, course_audio_filename, max_retries)
        except Exception as exc:
            # Thread boundary: report and move on to the next item instead of
            # killing the worker.
            print(f"Download worker failed on {attachment!r}: {exc}")
        finally:
            # Always balance the get(), otherwise Queue.join() never returns.
            queue.task_done()
|
||
|
||
|
||
def convert_mp4(mp4_file):
    """Extract the audio track of an MP4 file into a WAV file next to it.

    The WAV file has the same base name as the input and is written as
    16-bit signed little-endian PCM, 44100 Hz, 2 channels. An existing WAV
    file is overwritten (``-y``).

    Args:
        mp4_file: Path of the MP4 file to convert.

    Returns:
        The path of the WAV file on success, or ``None`` when FFmpeg exits
        with an error or the ``ffmpeg`` executable cannot be found.
    """
    # Same directory, same stem, ".wav" suffix.
    folder, base_name = os.path.split(mp4_file)
    stem = os.path.splitext(base_name)[0]
    wav_file = os.path.join(folder, f"{stem}.wav")

    ffmpeg_args = [
        'ffmpeg',
        '-y',
        '-i', mp4_file,
        '-vn',                   # drop the video stream
        '-acodec', 'pcm_s16le',  # 16-bit signed little-endian PCM
        '-ar', '44100',          # sample rate in Hz
        '-ac', '2',              # two channels (stereo)
        wav_file,
    ]

    try:
        subprocess.run(ffmpeg_args, check=True)
    except subprocess.CalledProcessError as e:
        print(f"转换失败: {e}")
        return None
    except FileNotFoundError:
        print("未找到 FFmpeg,请确保已安装并配置好 FFmpeg 环境。")
        return None

    print(f"成功将 {mp4_file} 转换为 {wav_file}")
    return wav_file
|
||
|
||
def _load_course_contents(course_id):
    """Return a course's contents payload, using ``json/<id>.json`` as a cache."""
    json_filename = os.path.join('json', f'{course_id}.json')
    if os.path.exists(json_filename):
        print(f"Course {course_id} JSON file already exists, using local file.")
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            return json.load(json_file)

    response = requests.get(
        f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
        headers=headers,
        timeout=30,
    )
    # Fail before caching, so an error payload is never stored as course data.
    response.raise_for_status()
    contents_data = response.json()
    with open(json_filename, 'w', encoding='utf-8') as json_file:
        json.dump(contents_data, json_file, ensure_ascii=False, indent=4)
    return contents_data


def _download_course_attachments(attachments, course_id_folder, course_audio_filename):
    """Download all attachments of one course with a pool of worker threads."""
    if not attachments:
        return

    attachment_queue = Queue()
    for attachment in attachments:
        attachment_queue.put(attachment)

    thread_count = max(1, min(max_download_threads, len(attachments)))
    threads = [
        Thread(
            target=worker,
            args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts),
        )
        for _ in range(thread_count)
    ]
    for t in threads:
        t.start()
    # Joining the threads is sufficient: each worker exits once the queue is
    # drained. Waiting on Queue.join() as well would hang forever if a worker
    # died before calling task_done().
    for t in threads:
        t.join()


def _merge_course_audio(audio_items, course_id_folder, course_audio_filename):
    """Concatenate a course's audio parts into one MP3, then delete the parts."""
    ordered_items = sorted(audio_items, key=lambda x: x['order'])
    audio_names = [
        item['attachment']['name']
        for item in ordered_items
        if item['attachment'] and item['attachment']['name']
    ]
    if not audio_names:
        return

    if not os.path.exists(course_audio_filename):
        combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')
        text_file = os.path.join(course_id_folder, 'input_files.txt')
        try:
            with open(text_file, 'w', encoding='utf-8') as f:
                for name in audio_names:
                    # Inside single quotes the concat list format needs a
                    # literal quote written as '\''.
                    escaped = name.replace("'", "'\\''")
                    f.write(f"file '{escaped}'\n")

            # -y: never block on an overwrite prompt left by an aborted run.
            subprocess.run(
                ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', text_file,
                 '-c', 'copy', combined_audio_filename],
                check=True,
            )
            shutil.move(combined_audio_filename, course_audio_filename)
        except (subprocess.CalledProcessError, OSError) as exc:
            # Keep the downloaded parts so the merge can be retried later.
            print(f"Failed to merge audio into {course_audio_filename}: {exc}")
            return
        finally:
            if os.path.exists(text_file):
                os.remove(text_file)

    # The merged file exists; remove the individual parts.
    for name in audio_names:
        audio_file_path = os.path.join(course_id_folder, name)
        if audio_file_path == course_audio_filename:
            # A part named like the merged file must not delete the result.
            continue
        try:
            os.remove(audio_file_path)
        except OSError:
            print('delete file fail')


def _sort_attachments_into_folders(items, course_id_folder):
    """Move non-MP3 attachments found in the course folder into per-extension folders."""
    for item in items:
        attachment = item['attachment']
        if not attachment or not attachment['name']:
            continue
        name = attachment['name']
        filename = os.path.join(course_id_folder, name)
        # isfile (not exists): a directory of the same name must not be moved.
        if not os.path.isfile(filename):
            continue
        file_extension = name.split('.')[-1].lower()
        if file_extension == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, file_extension)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _process_course_videos(course_id_folder):
    """Convert every MP4 under ``<course>/mp4`` to WAV and hand it to process_audio_file."""
    mp4_folder = os.path.join(course_id_folder, 'mp4')
    if not os.path.exists(mp4_folder):
        return
    for root, _dirs, files in os.walk(mp4_folder):
        for file in files:
            if not file.lower().endswith('.mp4'):
                continue
            wav_file = convert_mp4(os.path.join(root, file))
            if wav_file is None:
                continue
            try:
                process_audio_file(wav_file)
            except Exception as exc:
                # Best effort: one failing file must not stop the others.
                print(f'process_audio_file fail: {exc}')


def get_course():
    """Sync the most recent courses: store their contents and fetch their files.

    Steps:
      1. Read the highest course id from ``courses.db`` and select all courses
         whose id is at most 5 below it (an empty table falls back to id 11).
      2. For each selected course, load its contents (cached under ``json/``
         or fetched from the API) and insert them into the ``contents`` table.
      3. For each selected course, download the attachments into
         ``course/<id>/``, merge the audio parts into ``<title>.mp3``, sort the
         remaining files into per-extension folders, write text contents to
         ``<id>.txt`` and post-process MP4 files.

    Returns:
        None.

    Raises:
        Exceptions from the database, the HTTP request, or JSON/file handling
        propagate to the caller; the database connection is closed in any case.
    """
    conn = sqlite3.connect('courses.db')
    try:
        cursor = conn.cursor()

        # execute() returns the cursor itself (always truthy), so emptiness
        # has to be detected on the fetched row.
        row = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1').fetchone()
        if row:
            max_course_id = row[0]
            print(f"The maximum course ID is {max_course_id}")
        else:
            print("No courses found in the database.")
            max_course_id = 11
        start_course_id = max_course_id - 5

        cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
        course_ids_data = cursor.fetchall()
        print(course_ids_data)
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)
        print(course_ids_dict)
        print(course_ids)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        # First pass: fetch every course's contents and record them in the DB.
        for course_id in course_ids:
            print(f"Processing course ID: {course_id}")
            contents_data = _load_course_contents(course_id)

            for item in contents_data['data']:
                attachment = item['attachment']
                cursor.execute('''
                    INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url, mime_type)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                    ON CONFLICT(id) DO NOTHING
                ''', (item['id'], course_id, item['content'], item['category'], item['order'],
                      attachment['url'] if attachment else None,
                      attachment['mime_type'] if attachment else None))

        conn.commit()
        cursor.close()
    finally:
        conn.close()

    # Second pass: all contents are stored; download and post-process files.
    for course_id in course_ids:
        course_id_folder = os.path.join('course', str(course_id))
        os.makedirs(course_id_folder, exist_ok=True)

        json_filename = os.path.join('json', f'{course_id}.json')
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            contents_data = json.load(json_file)
        items = contents_data['data']
        course_audio_filename = os.path.join(course_id_folder, f'{course_ids_dict[course_id]}.mp3')

        attachments = [item['attachment'] for item in items if item['attachment']]
        _download_course_attachments(attachments, course_id_folder, course_audio_filename)

        audio_items = [item for item in items if item['category'] == 'audio']
        if audio_items:
            _merge_course_audio(audio_items, course_id_folder, course_audio_filename)

        _sort_attachments_into_folders(items, course_id_folder)

        # Save the contents of category "text" to a TXT file.
        text_contents = [item['content'] for item in items if item['category'] == 'text']
        if text_contents:
            with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
                for content in text_contents:
                    txt_file.write(content + '\n')

        _process_course_videos(course_id_folder)
|
||
|
||
|
||
# Run the course sync only when this file is executed as a script.
if __name__ == "__main__":
    get_course()