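update: refresh auth token, allow fetching selected course IDs, split transcripts into sentences, add CLI to transcribe_media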

2025-06-14 08:14:15 +08:00
parent 3d488600b5
commit 73bdba7440
4 changed files with 128 additions and 29 deletions
--- a/config.ini
+++ b/config.ini
@@ -6,8 +6,7 @@ download_id = 663

 max_download_threads = 5
 max_retry_attempts = 3
-authorization_token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhdWQiOiIxMDAwMDgzNDciLCJleHAiOjE3NDc5MTAyNjgsImp0aSI6IjA2MTNjN2NhLWMxYjUtNGYwZi1iMDZhLTJiZGE5YzAyNDBmZCIsImlhdCI6MTc0NTMxODI2OCwiaXNzIjoiYXBwdXNlciIsInVpZCI6ImJlMmViOGIyLTFhOTItNGVmMC05ZDAwLTA1YTlkN2E2OWRiMiIsInNjaGVtZSI6Imp3dGhzIiwic2lkIjoiZDBlMjZkNzItYjg0Yi00Y2IxLWJkYWQtN2ZhYWZlZTIxZTFjIn0.aWAc71N1gDqw_QPZAJRe9Tn0EAcTvDgVXBSZAUGM0Z8
-
+authorization_token=eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJhdWQiOiIxMDAwMDgzNDciLCJleHAiOjE3NjI4NTg3NzIsImp0aSI6ImNjZDU2MTAyLTY1NmQtNDFlMS1iNDhjLWQ3OThlMjBiYjViNiIsImlhdCI6MTc0NzMwNjc3MiwiaXNzIjoiYXBwdXNlciIsInVpZCI6ImJlMmViOGIyLTFhOTItNGVmMC05ZDAwLTA1YTlkN2E2OWRiMiIsInNjaGVtZSI6Imp3dGhzIiwic2lkIjoiMzZmOWNiNjctYjIzZi00Y2VhLTgwYWMtMDY1MWY4Mzc3NzZjIn0.qUbPigGOsS399BfDnsPedNLpxwjJtY71TKBzSV1oPVw

 voice2txt_url=https://api.siliconflow.cn/v1/audio/transcriptions
 voice_token=sk-vksrlpckcpttnpjgftpgwytmiipjmvhyzmnffhbhjpahbfiq
--- a/course_content_parser.py
+++ b/course_content_parser.py
@@ -76,7 +76,7 @@ def worker(queue, course_id_folder, course_audio_filename, max_retries):
        queue.task_done()


-def fetch_course():
+def fetch_course(courseIds):
    conn = sqlite3.connect('courses.db')
    cursor = conn.cursor()
    max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1')
@@ -88,8 +88,10 @@ def fetch_course():
        max_course_id = 11

    start_course_id = download_id if max_course_id - 5 < download_id else max_course_id - 5
-
+    if courseIds is None:
        cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,))
+    else:
+        cursor.execute('SELECT id, title FROM courses WHERE id IN ({})'.format(','.join('?' * len(courseIds))), courseIds)
    course_ids_data = cursor.fetchall()
    course_ids = [row[0] for row in course_ids_data]
    course_ids_dict = dict(course_ids_data)
@@ -116,9 +118,9 @@ def fetch_course():

        for item in contents_data['data']:
            cursor.execute('''
-            INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url, mime_type)
-            VALUES (?, ?, ?, ?, ?, ?, ?)
-            ON CONFLICT(id) DO NOTHING
+                           INSERT INTO contents (id, course_id, content, category, audio_order, attachment_url,
+                                                 mime_type)
+                           VALUES (?, ?, ?, ?, ?, ?, ?) ON CONFLICT(id) DO NOTHING
                           ''', (item['id'], course_id, item['content'], item['category'], item['order'],
                                 item['attachment']['url'] if item['attachment'] else None,
                                 item['attachment']['mime_type'] if item['attachment'] else None))
@@ -225,10 +227,19 @@ def download_course_contents(course_ids, course_ids_dict):
            convert_media(mp4_file)


-def get_course():
-    course_ids, course_ids_dict = fetch_course()
+def get_course(course_list=None):
+    # Resolve the courses through fetch_course, then download their contents.
+    # course_list is passed straight through: None keeps fetch_course's
+    # id-range query, a list of ids restricts the query to exactly those ids.
+    course_ids, course_ids_dict = fetch_course(course_list)
    download_course_contents(course_ids, course_ids_dict)


 if __name__ == '__main__':
-    get_course()
+    # Reference lists of course ids, grouped under what appear to be series
+    # titles (translated from Chinese -- confirm the wording against the site).
+    # Assign one of these lists to course_list to fetch only those courses.
+    # The Truth About Multilingualism 381, 380, 382, 384, 385, 386, 387, 388, 391, 392, 393, 394, 399, 400, 402, 408, 409, 413, 414, 415, 423, 424, 534, 554
+    # Writing course 225, 226, 228, 230, 231, 232, 238, 240, 564, 565, 566, 567, 568, 569, 243
+    # Tea talks 87, 88, 89, 90, 91, 92, 93, 94, 97, 147, 177, 186, 190, 213, 219, 224, 235, 242, 263, 278, 289, 297, 305, 383, 398, 410, 418, 433, 505, 545, 582, 620, 638, 651, 664, 676, 686
+    # Student stories 364, 367, 368, 370, 371, 372, 373, 374, 375, 376, 377, 378, 313, 320, 319, 406, 405, 404, 401, 403, 411, 416, 417, 421, 422, 427, 430, 447, 438, 444, 446, 485, 487, 489, 493, 496, 498, 502, 507, 509, 513, 515, 518, 519, 529, 537, 544, 557, 575, 584, 590, 595, 598, 599, 606, 608, 611, 613, 618, 621, 624, 628, 633, 635, 637, 653, 642, 645, 648, 652, 655, 657, 662, 665, 671, 673, 674, 677, 680, 682, 685, 687
+    # See You in Seven Years 484, 448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473, 474, 475, 476, 477, 478, 60, 61, 86, 95, 148, 233, 246, 273, 291, 317, 396, 412, 426, 442, 492, 506, 552, 589, 607, 619, 644, 656, 672, 683
+    # Xiaolai's talks collection 191, 192, 193, 194, 195, 196, 214, 215, 216, 217, 220, 221, 222, 227, 229, 234, 236, 237, 239, 241, 243, 244, 245, 247, 248, 249, 253, 290, 292, 390, 395, 397, 399, 400, 402, 409, 408, 414, 419, 423, 428, 429, 435, 436, 439, 440, 443, 445, 479, 488, 490, 491, 495, 497, 499, 500, 503, 508, 510, 514, 516, 517, 520, 521, 530, 532, 534, 538, 543, 547, 549, 554, 558, 562, 563, 570, 573, 578, 597, 609, 610, 612, 615, 616, 617, 622, 623, 626, 631, 634, 641, 640, 643, 647, 650, 654, 661, 668, 675, 679
+    # Artificial intelligence 585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605
+    # Example of an explicit selection (the artificial-intelligence ids above):
+    # course_list = [585, 586, 587, 588, 591, 592, 593, 594, 601, 602, 604, 605]
+    # None makes get_course fall back to its default argument value.
+    course_list = None
+    get_course(course_list)
--- a/courses.db
+++ b/courses.db
--- a/transcribe_media.py
+++ b/transcribe_media.py
@@ -1,4 +1,5 @@
 import os
+import argparse

 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
@@ -16,13 +17,16 @@ def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav")
    if ext in [".mp4", ".mov", ".avi", ".mkv"]:
        logger.info("🎬 Extracting audio from video...")
        video = VideoFileClip(file_path)
+        if video.audio is None:
+            print("⚠️ 警告：该视频没有音频轨道。")
+            return None
        video.audio.write_audiofile(output_audio_path)
    elif ext in [".mp3", ".wav", ".flac", ".m4a", ".aac"]:
        logger.info("🎧 Converting audio format...")
        sound = AudioSegment.from_file(file_path)
        sound.export(output_audio_path, format="wav")
    else:
-        raise ValueError("Unsupported file type.")
+        raise ValueError(f"Unsupported file type: {ext}")

    return output_audio_path

@@ -51,7 +55,7 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"):
    )

    text = rich_transcription_postprocess(res[0]["text"])
-    return text
+    return split_into_sentences(text)

 def transcribe_audio_funasr_batch(audio_path):
    model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True)
@@ -65,22 +69,82 @@ def transcribe_audio_funasr_batch(audio_path):
    )

    text = rich_transcription_postprocess(res[0]["text"])
-    return text
+    return split_into_sentences(text)
+
+
+# Added in this change: the characters that split_into_sentences treats as
+# sentence terminators (full-width and ASCII punctuation, plus newline).
+SENTENCE_ENDINGS = ["。", "！", "？", ".", "!", "?", "\n"]
+
+
+def split_into_sentences(text, max_length=100):
+    """Split text into one sentence per line and wrap over-long lines.
+
+    The text is cut after every run of sentence terminators (the characters
+    listed in SENTENCE_ENDINGS), so "?!" or "..." stays attached to the
+    sentence it closes and whitespace-only pieces never become blank lines.
+    An ASCII "." between two digits (as in "3.14") is not a terminator.
+    A sentence longer than max_length is wrapped: preferably right after the
+    last comma-like character or space that still fits, otherwise at exactly
+    max_length characters. A wrap never leaves bare terminators on a line of
+    their own unless the sentence consists of almost nothing else.
+
+    Args:
+        text (str): Text to split; may be empty or None.
+        max_length (int): Maximum number of characters per output line.
+            A value below 1 disables wrapping.
+
+    Returns:
+        str: The resulting lines joined with newline characters, or an
+        empty string when text is empty.
+    """
+    if not text:
+        return ""
+
+    # Character set form of the terminators, needed by str.rstrip below.
+    endings = "".join(SENTENCE_ENDINGS)
+    # Places where a long sentence may be wrapped without cutting a clause.
+    soft_breaks = "，、；：,; "
+    size = len(text)
+
+    # Pass 1: cut the text after each run of terminators.
+    sentences = []
+    start = 0
+    pos = 0
+    while pos < size:
+        char = text[pos]
+        is_decimal_point = (
+            char == "."
+            and 0 < pos < size - 1
+            and text[pos - 1].isdigit()
+            and text[pos + 1].isdigit()
+        )
+        if char not in SENTENCE_ENDINGS or is_decimal_point:
+            pos += 1
+            continue
+        # Swallow the whole run of terminators so they are not orphaned.
+        pos += 1
+        while pos < size and text[pos] in SENTENCE_ENDINGS:
+            pos += 1
+        sentence = text[start:pos].strip()
+        if sentence:
+            sentences.append(sentence)
+        start = pos
+
+    # Whatever follows the last terminator is an unterminated final sentence.
+    tail = text[start:].strip()
+    if tail:
+        sentences.append(tail)
+
+    # Pass 2: wrap sentences that exceed max_length.
+    lines = []
+    for sentence in sentences:
+        rest = sentence
+        while max_length >= 1 and len(rest) > max_length:
+            window = rest[:max_length]
+            # Cut just after the last soft break that fits, if there is one.
+            cut = max(window.rfind(mark) for mark in soft_breaks) + 1
+            if cut == 0:
+                cut = max_length
+            # If only terminators would remain, move the cut back one
+            # character so the closing punctuation keeps some text with it.
+            core = len(rest.rstrip(endings))
+            if 1 < core <= cut:
+                cut = core - 1
+            lines.append(rest[:cut].strip())
+            rest = rest[cut:].strip()
+        if rest:
+            lines.append(rest)
+
+    return "\n".join(lines)


 def convert_media(file_path, is_batch=False, save_to_disk=True):
    try:
        audio_file = extract_or_convert_audio(file_path)
+        if audio_file is None:
+            return None
+
        if is_batch:
            transcript = transcribe_audio_funasr_batch(audio_file)
        else:
            transcript = transcribe_audio_funasr(audio_file)
+
        logger.info("\n📜 Transcript:")
        logger.info(transcript)

-        # ✅ Save transcript to disk
+        # ✅ Save transcript to disk as .txt file
        if save_to_disk:
-            output_path = os.path.splitext(file_path)[0] + ".md"
+            output_path = os.path.splitext(file_path)[0] + ".txt"
            with open(output_path, "w", encoding="utf-8") as f:
                f.write(transcript)
            logger.info(f"✅ Transcript saved to: {output_path}")
@@ -90,18 +154,43 @@ def convert_media(file_path, is_batch=False, save_to_disk=True):
            os.remove("processed_audio.wav")


+def process_input(path, recursive=False):
+    """Transcribe a single media file, or the media files inside a folder.
+
+    Args:
+        path (str): A file or directory path. A missing path is logged as
+            an error and nothing is processed.
+        recursive (bool): For a directory, also descend into its
+            subdirectories; otherwise only the top level is scanned.
+
+    Files whose extension is not a supported audio/video type are skipped
+    (logged as a warning for a single file, at debug level inside a folder).
+    """
+    if not os.path.exists(path):
+        logger.error(f"❌ Path does not exist: {path}")
+        return
+
+    media_suffixes = {".mp4", ".mov", ".avi", ".mkv", ".mp3", ".wav", ".flac", ".m4a", ".aac"}
+
+    def is_media(name):
+        # Extension comparison is case-insensitive.
+        return os.path.splitext(name)[1].lower() in media_suffixes
+
+    if os.path.isfile(path):
+        if is_media(path):
+            convert_media(path)
+        else:
+            logger.warning(f"🚫 Unsupported file skipped: {path}")
+        return
+
+    if not os.path.isdir(path):
+        return
+
+    for folder, _subfolders, names in os.walk(path):
+        for name in names:
+            candidate = os.path.join(folder, name)
+            if not is_media(name):
+                logger.debug(f"Skipping non-media file: {candidate}")
+                continue
+            # One failing file must not abort the rest of the batch.
+            try:
+                convert_media(candidate, False)
+            except Exception as e:
+                logger.error(f"Error processing {candidate}: {e}")
+        # os.walk yields the top-level folder first; stop there unless asked
+        # to recurse.
+        if not recursive:
+            break


 def main():
-    audio_files = []
-    for root, dirs, files in os.walk('media'):
-        for file in files:
-            audio_files.append(os.path.join(root, file))
-    logger.info("scan files: " + audio_files)
+    # Command-line entry point: read an optional input path (default
+    # "./media") and the --recursive / -r flag, then hand both to
+    # process_input.
+    parser = argparse.ArgumentParser(description="Convert audio/video to text using FunASR.")
+    parser.add_argument("input_path", nargs='?', default="./media", help="Path to a file or folder containing media files. Defaults to './media'.")
+    parser.add_argument("--recursive", "-r", action="store_true", help="Process subdirectories recursively.")
+    args = parser.parse_args()

-    for audio_file in audio_files:
-        audio_file_path = os.path.join(audio_file)
-        convert_media(audio_file_path)
+    input_path = args.input_path
+    process_input(input_path, recursive=args.recursive)


 if __name__ == '__main__':