fix transcribe error

(cherry picked from commit 255cc192690654535c4ebeecec1ef6500943f42e)
update file and add README
2025-12-23 10:33:47 +08:00 · 2025-12-02 00:19:00 +08:00
5 changed files with 262 additions and 6 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -30,7 +30,8 @@
 *.tar.bz2
 *.tgz

-*.md
+/markdown/*.md
+/content/*.md

 # 其他格式的媒体文件
 /.venv/
--- a/README.md
+++ b/README.md
@@ -0,0 +1,218 @@
+# Songyi Course Content Scraper & Transcriber
+
+A Python-based automated system for scraping, downloading, and transcribing online course content from the Bandu API. The system converts course materials into Hugo-compatible markdown files with audio/video transcriptions.
+
+## Features
+
+- **Course Data Management**: Fetches and stores course metadata in SQLite/PostgreSQL databases
+- **Multi-threaded Downloads**: Efficiently downloads course materials (audio, video, images, text) using aria2c
+- **Audio Processing**: Automatically combines multiple audio segments into single MP3 files using FFmpeg
+- **Speech-to-Text**: Transcribes audio/video content using FunASR/SenseVoice models
+- **Hugo Integration**: Generates markdown files with proper frontmatter for Hugo static sites
+- **Smart Caching**: Stores transcriptions in database to avoid redundant processing
+
+## Prerequisites
+
+### System Dependencies
+- Python 3.12+
+- FFmpeg
+- aria2c
+
+### Python Dependencies
+See [requirements.txt](requirements.txt) for the full list. Key packages include:
+- requests
+- gradio_client
+- funasr
+- librosa
+- moviepy
+- pymongo
+- psycopg2-binary
+
+## Installation
+
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd songyi
+```
+
+2. Create and activate virtual environment:
+```bash
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+```
+
+3. Install Python dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+4. Install system dependencies:
+```bash
+# Ubuntu/Debian
+sudo apt-get install ffmpeg aria2
+
+# macOS
+brew install ffmpeg aria2
+```
+
+5. Create configuration file:
+```bash
+cp config.ini.example config.ini
+# Edit config.ini with your settings
+```
+
+## Configuration
+
+Create a `config.ini` file with the following structure:
+
+```ini
+[DEFAULT]
+authorization_token = your_bearer_token_here
+limit = 100
+offset = 0
+sort = newest-first
+max_download_threads = 5
+max_retry_attempts = 3
+download_id = 1
+
+[POSTGRES]
+dbname = your_db_name
+user = your_username
+password = your_password
+host = localhost
+port = 5432
+```
+
+## Usage
+
+### Run Complete Pipeline
+
+Execute the entire workflow (fetch courses, download content, generate markdown):
+
+```bash
+python main.py
+```
+
+### Individual Components
+
+**Fetch course list only:**
+```bash
+python course_list_info_parser.py
+```
+
+**Download course content only:**
+```bash
+python course_content_parser.py
+```
+
+**Generate markdown files only:**
+```bash
+python markdown_transcribe_hugo.py
+```
+
+## Project Structure
+
+```
+songyi/
+├── main.py                          # Main orchestration script
+├── course_list_info_parser.py       # Fetches course metadata
+├── course_content_parser.py         # Downloads course materials
+├── markdown_transcribe_hugo.py      # Generates Hugo markdown
+├── transcribe_media.py              # Audio/video transcription
+├── headers.py                       # HTTP headers configuration
+├── logging_config.py                # Logging setup
+├── config.ini                       # Configuration file (not in repo)
+├── courses.db                       # SQLite database
+├── content/                         # Generated Hugo markdown files
+├── course/                          # Downloaded course materials
+│   └── {course_id}/
+│       ├── mp3/                     # Audio files
+│       ├── mp4/                     # Video files
+│       └── ...
+└── json/                            # API response cache
+    └── {course_id}.json
+```
+
+## Workflow
+
+1. **Fetch Courses**: Retrieves course list from API and stores in database
+2. **Download Content**: Downloads all course materials (audio, video, images, text)
+3. **Process Audio**: Combines audio segments and transcribes them
+4. **Generate Markdown**: Creates Hugo-compatible markdown files with:
+   - Frontmatter (date, draft, title)
+   - Text content
+   - Images with URLs
+   - Audio transcriptions
+
+## Database Schema
+
+### courses
+- `id` (INTEGER PRIMARY KEY)
+- `title` (TEXT)
+- `description` (TEXT)
+
+### contents
+- `id` (INTEGER PRIMARY KEY)
+- `course_id` (INTEGER)
+- `content` (TEXT)
+- `category` (TEXT)
+- `audio_order` (INTEGER)
+- `attachment_url` (TEXT)
+- `mime_type` (TEXT)
+
+### audio_transcriptions
+- `id` (INTEGER PRIMARY KEY AUTOINCREMENT)
+- `course_id` (INTEGER)
+- `filename` (TEXT)
+- `text` (TEXT)
+- `UNIQUE(course_id, filename)`
+
+## Features in Detail
+
+### Multi-threaded Downloads
+Uses thread pools to download multiple files concurrently with configurable retry logic.
+
+### Audio Merging
+Automatically detects multiple audio segments and merges them in order using FFmpeg.
+
+### Transcription Caching
+Stores transcription results in the database to avoid re-processing the same audio files.
+
+### Hugo Output Format
+Generates markdown files with proper Hugo frontmatter:
+```markdown
++++
+date = '2025-10-08'
+draft = false
+title = 'Course Title'
++++
+
+Course content here...
+```
+
+## Error Handling
+
+- Automatic retry for failed downloads (configurable)
+- Skips existing files to avoid redundant downloads
+- Logs all operations for debugging
+- Graceful handling of missing or corrupted files
+
+## Logging
+
+Logs are configured through [logging_config.py](logging_config.py). Check console output for progress and error messages.
+
+## Contributing
+
+This is a personal project for archiving online course content. Feel free to fork and adapt for your own needs.
+
+## License
+
+[Add your license here]
+
+## Notes
+
+- Ensure you have proper authorization to download and process the course content
+- The system is designed for the Bandu API structure; modifications are needed for other sources
+- Transcription quality depends on the FunASR/SenseVoice model configuration
+- Large courses may require significant disk space and processing time
--- a/courses.db
+++ b/courses.db
--- a/json/745.json
+++ b/json/745.json
@@ -0,0 +1,32 @@
+{
+    "ts": 1764605503269,
+    "data": [
+        {
+            "id": 14900,
+            "course_id": 745,
+            "content": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+            "category": "video",
+            "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93",
+            "order": 0,
+            "duration": 4308320,
+            "created_at": "2025-11-30T13:01:32.073Z",
+            "updated_at": "2025-11-30T13:03:19.25Z",
+            "attachment": {
+                "id": 102535,
+                "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93",
+                "name": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "thumb": "",
+                "raw": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "size": 0,
+                "duration": 4308320,
+                "mime_type": "application/x-mpegurl",
+                "location": "qiniu",
+                "created_at": "2025-11-30T13:01:32.07Z",
+                "updated_at": "2025-11-30T13:03:19.246Z",
+                "url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "raw_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "thumb_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8"
+            }
+        }
+    ]
+}
--- a/transcribe_media.py
+++ b/transcribe_media.py
@@ -1,5 +1,6 @@
 import os
 import argparse
+import uuid

 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
@@ -11,8 +12,11 @@ from logging_config import setup_logging
 logger = setup_logging()


-def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav"):
+def extract_or_convert_audio(file_path, output_audio_path="processed_audio"):
    ext = os.path.splitext(file_path)[1].lower()
+    filename = os.path.basename(file_path)
+    random_uuid = str(uuid.uuid4())
+    output_audio_path = output_audio_path + "_" + random_uuid + ".wav"

    if ext in [".mp4", ".mov", ".avi", ".mkv"]:
        logger.info("🎬 Extracting audio from video...")
@@ -27,11 +31,12 @@ def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav")
        sound.export(output_audio_path, format="wav")
    else:
        raise ValueError(f"Unsupported file type: {ext}")
+    logger.info(f"Converted Audio saved to: {output_audio_path}")

    return output_audio_path


-def transcribe_audio_funasr(audio_path, device="cuda:0"):
+def transcribe_audio_funasr(audio_path, device="cpu"):
    logger.info("🧠 Loading FunASR model...")
    model = AutoModel(
        model="iic/SenseVoiceSmall",
@@ -59,7 +64,7 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"):


 # 加载模型并作为全局变量
-default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True)
+default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cpu", disable_update=True)

 def transcribe_audio_funasr_batch(audio_path):
    res = default_model.generate(
@@ -152,8 +157,8 @@ def convert_media(file_path, is_batch=False, save_to_disk=True):
            logger.info(f"✅ Transcript saved to: {output_path}")
        return transcript
    finally:
-        if os.path.exists("processed_audio.wav"):
-            os.remove("processed_audio.wav")
+        if os.path.exists(audio_file):
+            os.remove(audio_file)


 def process_input(path, recursive=False):
Author	SHA1	Message	Date
YuanHui	e823388753	fix transcribe error (cherry picked from commit 255cc192690654535c4ebeecec1ef6500943f42e)	2025-12-23 10:33:47 +08:00
lostecho	0ce1ec18c3	update file and add README	2025-12-02 00:19:00 +08:00