# songyi/course_content_parser.py

# -*- coding: utf-8 -*-
import configparser
import os
import shutil
import sqlite3
import subprocess
from queue import Queue
from threading import Thread

import requests

import json
from headers import headers
from logging_config import setup_logging
from transcribe_media import convert_media

logger = setup_logging()

# Load runtime settings from config.ini; every key is read from the DEFAULT section.
config = configparser.ConfigParser()
config.read('config.ini')
_defaults = config['DEFAULT']
authorization_token = _defaults['authorization_token']
max_download_threads = int(_defaults['max_download_threads'])
max_retry_attempts = int(_defaults['max_retry_attempts'])
download_id = int(_defaults['download_id'])

# Attach the bearer token to the imported request headers (item assignment, in place).
headers['authorization'] = f'Bearer {authorization_token}'


def download_attachment(attachment, course_id_folder, course_audio_filename, max_retries):
    """Download one attachment with aria2c, retrying on failure.

    Non-mp3 files are stored in a sub-folder of ``course_id_folder`` named after
    the file extension; mp3 files are stored directly in ``course_id_folder``.

    Args:
        attachment: Mapping with ``'name'`` and ``'url'`` keys. ``'name'`` may be
            empty or ``None``, in which case the last URL path segment is used.
        course_id_folder: Folder of the course the attachment belongs to.
        course_audio_filename: Path of the course's merged mp3. When it already
            exists, mp3 attachments are skipped.
        max_retries: Maximum number of aria2c invocations.

    Returns:
        None. Download failures are logged, not raised.
    """
    name = attachment['name']
    url = attachment['url']

    if name in ("", None) or name.endswith(".m3u8"):
        logger.info("字符串为空")
        download_filename = url[url.rfind('/') + 1:]
    else:
        download_filename = name

    # Fall back to the URL-derived file name so a missing name cannot crash here.
    file_extension = (name or download_filename).split('.')[-1].lower()

    # Resolve the target folder once, outside the retry loop, so retries do not
    # keep nesting the extension folder (e.g. "pdf/pdf/...").
    if file_extension == 'mp3':
        if os.path.exists(course_audio_filename):
            logger.info(f"File {course_audio_filename} already exists, skipping download.")
            return
        target_folder = course_id_folder
    else:
        target_folder = os.path.join(course_id_folder, file_extension)

    filename = os.path.join(target_folder, download_filename)

    # Checked before the first attempt only: a partial file left behind by a
    # failed attempt must not make the retry look like a completed download.
    if os.path.exists(filename):
        logger.info(f"File {filename} already exists, skipping download.")
        return

    # Argument list instead of a shell string: URLs routinely contain '&', '?'
    # or ';', which a shell would interpret.
    command = ['aria2c', '-o', filename, '-x', '16', '-s', '16', url]

    for attempt in range(1, max_retries + 1):
        try:
            subprocess.run(command, check=True)
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing aria2c binary, which the shell used to
            # report as a non-zero exit status.
            logger.error(f"Failed to download {name}: {e}")
            if attempt == max_retries:
                logger.error(f"Failed to download {name} after {max_retries} attempts.")
            else:
                logger.warning(f"Retrying {name}... ({attempt}/{max_retries})")
        else:
            logger.info(f"Download Command: {' '.join(command)}")
            return


def worker(queue, course_id_folder, course_audio_filename, max_retries):
    """Drain ``queue``, downloading each attachment until the queue is empty.

    Intended to run in several threads sharing one queue.

    Args:
        queue: ``queue.Queue`` of attachment mappings.
        course_id_folder: Passed through to ``download_attachment``.
        course_audio_filename: Passed through to ``download_attachment``.
        max_retries: Passed through to ``download_attachment``.
    """
    # Function-scope import: the file only imports ``Queue`` at module level.
    from queue import Empty

    while True:
        # ``empty()`` followed by a blocking ``get()`` is a race: another thread
        # can take the last item in between and this thread would block forever.
        try:
            attachment = queue.get_nowait()
        except Empty:
            return

        try:
            download_attachment(attachment, course_id_folder, course_audio_filename, max_retries)
        except Exception:
            # Thread boundary: log and move on so one bad attachment does not
            # kill the worker and strand the remaining items.
            logger.exception(f"Unexpected error while downloading attachment: {attachment!r}")
        finally:
            # Always acknowledge the item, otherwise ``Queue.join()`` never returns.
            queue.task_done()


def fetch_course(courseIds):
    """Select courses, cache their contents as JSON and store them in SQLite.

    For every selected course the contents are read from ``json/<id>.json`` if
    that file exists, otherwise fetched from the API and written to that file.
    Each content item is inserted into the ``contents`` table (existing ids are
    left untouched).

    Args:
        courseIds: Iterable of course ids to process, or ``None`` to process all
            courses with ``id >= max(download_id, newest_id - 3)``.

    Returns:
        Tuple ``(course_ids, course_ids_dict)``: the list of selected ids and a
        dict mapping each id to its title.

    Raises:
        requests.HTTPError: If the API answers with an error status.
    """
    insert_sql = '''
        INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url,
                              mime_type)
        VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO NOTHING
    '''

    conn = sqlite3.connect('courses.db')
    try:
        cursor = conn.cursor()

        # ``execute`` returns the cursor itself (always truthy); the row has to
        # be fetched before testing whether the table is empty.
        newest_row = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1').fetchone()
        if newest_row:
            max_course_id = newest_row[0]
            logger.info(f"The maximum course ID is {max_course_id}")
        else:
            logger.info("No courses found in the database.")
            max_course_id = 11

        start_course_id = max(download_id, max_course_id - 3)
        if courseIds is None:
            cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
        else:
            placeholders = ','.join('?' * len(courseIds))
            cursor.execute(f'SELECT id, title FROM courses WHERE id IN ({placeholders})',
                           tuple(courseIds))
        course_ids_data = cursor.fetchall()
        course_ids = [row[0] for row in course_ids_data]
        course_ids_dict = dict(course_ids_data)

        os.makedirs('json', exist_ok=True)
        os.makedirs('course', exist_ok=True)

        for course_id in course_ids:
            logger.info(f"Processing course ID: {course_id}")

            json_filename = os.path.join('json', f'{course_id}.json')
            if os.path.exists(json_filename):
                logger.info(f"Course {course_id} JSON file already exists, using local file.")
                with open(json_filename, 'r', encoding='utf-8') as json_file:
                    contents_data = json.load(json_file)
            else:
                response = requests.get(
                    f'https://bandu-api.songy.info/v2/courses/{course_id}/contents',
                    headers=headers,
                    timeout=30,  # do not hang forever on a stalled connection
                )
                # Fail before caching: an error payload written to disk would be
                # reused as the course contents on every later run.
                response.raise_for_status()
                contents_data = response.json()
                with open(json_filename, 'w', encoding='utf-8') as save_json_file:
                    json.dump(contents_data, save_json_file, ensure_ascii=False, indent=4)

            cursor.executemany(insert_sql, (
                (item['id'], course_id, item['content'], item['category'], item['order'],
                 item['attachment']['url'] if item['attachment'] else None,
                 item['attachment']['mime_type'] if item['attachment'] else None)
                for item in contents_data['data']
            ))
            # Without an explicit commit the inserts are rolled back when the
            # connection goes away.
            conn.commit()

        return course_ids, course_ids_dict
    finally:
        conn.close()


def _download_attachments(attachments, course_id_folder, course_audio_filename):
    """Download ``attachments`` concurrently with ``max_download_threads`` workers."""
    attachment_queue = Queue()
    for attachment in attachments:
        attachment_queue.put(attachment)

    threads = [
        Thread(target=worker,
               args=(attachment_queue, course_id_folder, course_audio_filename, max_retry_attempts))
        for _ in range(max_download_threads)
    ]
    for t in threads:
        t.start()

    attachment_queue.join()
    for t in threads:
        t.join()


def _merge_audio(audio_items, course_id_folder, course_audio_filename):
    """Concatenate the audio parts into ``course_audio_filename`` and delete the parts."""
    text_file = os.path.join(course_id_folder, 'input_files.txt')
    combined_audio_filename = os.path.join(course_id_folder, 'combined_audio.mp3')

    # Explicit UTF-8 so non-ASCII file names survive regardless of the locale.
    with open(text_file, 'w', encoding='utf-8') as f:
        for item in audio_items:
            # Inside an ffmpeg single-quoted string a quote is written as '\''.
            escaped_name = item['attachment']['name'].replace("'", "'\\''")
            f.write(f"file '{escaped_name}'\n")

    # Argument list (no shell) so paths with spaces or shell metacharacters are
    # passed through verbatim; -y overwrites a stale temp file from an earlier run.
    ffmpeg_command = ['ffmpeg', '-y', '-f', 'concat', '-safe', '0', '-i', text_file,
                      '-c', 'copy', combined_audio_filename]
    try:
        subprocess.run(ffmpeg_command, check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        # Keep the individual parts when merging failed; they are the only copy.
        logger.error(f"ffmpeg failed to merge audio into {course_audio_filename}: {e}")
        return
    finally:
        if os.path.exists(text_file):
            os.remove(text_file)

    shutil.move(combined_audio_filename, course_audio_filename)

    # Delete the individual audio parts now that the merged file exists.
    for item in audio_items:
        audio_file_path = os.path.join(course_id_folder, item['attachment']['name'])
        try:
            os.remove(audio_file_path)
        except OSError:
            logger.error('delete file fail')


def _move_attachments_into_extension_folders(items, course_id_folder):
    """Move non-mp3 attachments found in the course folder into ``<extension>/``."""
    for item in items:
        attachment = item['attachment']
        # An empty/None name would make the path below point at the course
        # folder itself (or fail to join at all).
        if not attachment or not attachment['name']:
            continue
        name = attachment['name']
        filename = os.path.join(course_id_folder, name)
        if not os.path.isfile(filename):
            continue
        file_extension = name.split('.')[-1].lower()
        if file_extension == 'mp3':
            continue
        folder_path = os.path.join(course_id_folder, file_extension)
        os.makedirs(folder_path, exist_ok=True)
        shutil.move(filename, os.path.join(folder_path, name))


def _write_text_contents(items, course_id_folder, course_id):
    """Write the content of every ``text`` item to ``<course_id>.txt``, one per line."""
    text_contents = [item['content'] for item in items if item['category'] == 'text']
    if not text_contents:
        return
    with open(os.path.join(course_id_folder, f'{course_id}.txt'), 'w', encoding='utf-8') as txt_file:
        txt_file.writelines(content + '\n' for content in text_contents)


def _convert_mp4_if_needed(course_id_folder):
    """Run ``convert_media`` on the course's mp4 unless a ``.md`` file already exists."""
    mp4_folder = os.path.join(course_id_folder, 'mp4')
    mp4_file = None
    # os.walk yields nothing when the folder does not exist.
    for root, _dirs, files in os.walk(mp4_folder):
        for file in files:
            lowered = file.lower()
            if lowered.endswith('.md'):
                # A markdown file is already present: nothing to convert.
                return
            if lowered.endswith('.mp4'):
                # The last mp4 encountered wins.
                mp4_file = os.path.join(root, file)
    if mp4_file is not None:
        convert_media(mp4_file)


def download_course_contents(course_ids, course_ids_dict):
    """Download, merge and organise the files of every course in ``course_ids``.

    For each course, reading its contents from ``json/<id>.json``:
      1. download all attachments into ``course/<id>/``;
      2. merge the audio parts (ordered by ``order``) into ``<title>.mp3``
         unless that file already exists;
      3. move remaining non-mp3 attachments into per-extension sub-folders;
      4. write all ``text`` contents to ``<id>.txt``;
      5. pass the course's mp4 (if any) to ``convert_media`` unless a ``.md``
         file is already present in the ``mp4`` folder.

    Args:
        course_ids: Iterable of course ids to process.
        course_ids_dict: Mapping of course id to course title.
    """
    os.makedirs('json', exist_ok=True)
    os.makedirs('course', exist_ok=True)

    for course_id in course_ids:
        course_id_folder = os.path.join('course', str(course_id))
        os.makedirs(course_id_folder, exist_ok=True)

        json_filename = os.path.join('json', f'{course_id}.json')
        with open(json_filename, 'r', encoding='utf-8') as json_file:
            contents_data = json.load(json_file)
        items = contents_data['data']
        course_audio_filename = os.path.join(course_id_folder, f'{course_ids_dict[course_id]}.mp3')

        _download_attachments(
            [item['attachment'] for item in items if item['attachment']],
            course_id_folder,
            course_audio_filename,
        )

        # Audio items without a usable attachment name cannot be merged.
        audio_files = sorted(
            (item for item in items
             if item['category'] == 'audio' and item['attachment'] and item['attachment']['name']),
            key=lambda x: x['order'],
        )
        if audio_files and not os.path.exists(course_audio_filename):
            _merge_audio(audio_files, course_id_folder, course_audio_filename)

        _move_attachments_into_extension_folders(items, course_id_folder)
        _write_text_contents(items, course_id_folder, course_id)
        _convert_mp4_if_needed(course_id_folder)


def get_course(course_list=None):
    """Fetch course metadata/contents, then download the files of those courses.

    Args:
        course_list: Course ids to process, or ``None``; forwarded unchanged to
            ``fetch_course``.
    """
    ids_and_titles = fetch_course(course_list)
    selected_ids, titles_by_id = ids_and_titles
    download_course_contents(selected_ids, titles_by_id)


if __name__ == '__main__':
    # Known course ids, grouped by series:
    # The Truth About Multilingualism: 381, 380, 382, 384, 385, 386, 387, 388, 391, 392, 393, 394, 399, 400, 402, 408, 409, 413, 414, 415, 423, 424, 534, 554
    # Writing Course: 225, 226, 228, 230, 231, 232, 238, 240, 564, 565, 566, 567, 568, 569, 243
    # Tea Party Talks: 87, 88, 89, 90, 91, 92, 93, 94, 97, 147, 177, 186, 190, 213, 219, 224, 235, 242, 263, 278, 289, 297, 305, 383, 398, 410, 418, 433, 505, 545, 582, 620, 638, 651, 664, 676, 686
    # Student Stories: 364, 367, 368, 370, 371, 372, 373, 374, 375, 376, 377, 378, 313, 320, 319, 406, 405, 404, 401, 403, 411, 416, 417, 421, 422, 427, 430, 447, 438, 444, 446, 485, 487, 489, 493, 496, 498, 502, 507, 509, 513, 515, 518, 519, 529, 537, 544, 557, 575, 584, 590, 595, 598, 599, 606, 608, 611, 613, 618, 621, 624, 628, 633, 635, 637, 653, 642, 645, 648, 652, 655, 657, 662, 665, 671, 673, 674, 677, 680, 682, 685, 687
    # See You in Seven Years: 484, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 60, 61, 86, 95, 148, 233, 246, 273, 291, 317, 396, 412, 426, 442, 492, 506, 552, 589, 607, 619, 644, 656, 672, 683
    # Xiaolai's Talks Collection: 191, 192, 193, 194, 195, 196, 214, 215, 216, 217, 220, 221, 222, 227, 229, 234, 236, 237, 239, 241, 243, 244, 245, 247, 248, 249, 253, 290, 292, 390, 395, 397, 399, 400, 402, 409, 408, 414, 419, 423, 428, 429, 435, 436, 439, 440, 443, 445, 479, 488, 490, 491, 495, 497, 499, 500, 503, 508, 510, 514, 516, 517, 520, 521, 530, 532, 534, 538, 543, 547, 549, 554, 558, 562, 563, 570, 573, 578, 597, 609, 610, 612, 615, 616, 617, 622, 623, 626, 631, 634, 641, 640, 643, 647, 650, 654, 661, 668, 675, 679
    # Artificial Intelligence: 585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605
    #
    # To download one series, assign its ids instead of None, e.g.:
    # selected_course_ids = [585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605]
    # None selects courses by the id threshold computed in fetch_course.
    selected_course_ids = None
    get_course(course_list=selected_course_ids)