From 0ce1ec18c3f52330cee156ba4f06079b161a31cd Mon Sep 17 00:00:00 2001
From: lostecho <lonelinster@gmail.com>
Date: Tue, 2 Dec 2025 00:19:00 +0800
Subject: [PATCH] update file and add README

---
 .gitignore          |   3 +-
 README.md           | 218 ++++++++++++++++++++++++++++++++++++++++++++
 courses.db          | Bin 8163328 -> 8167424 bytes
 json/745.json       |  32 +++++++
 transcribe_media.py |   4 +-
 5 files changed, 254 insertions(+), 3 deletions(-)
 create mode 100644 README.md
 create mode 100644 json/745.json
diff --git a/.gitignore b/.gitignore
index 4e07bc9..65b999c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -30,7 +30,8 @@
 *.tar.bz2
 *.tgz
 
-*.md
+/markdown/*.md
+/content/*.md
 
 # 其他格式的媒体文件
 /.venv/
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..130dd22
--- /dev/null
+++ b/README.md
@@ -0,0 +1,218 @@
+# Songyi Course Content Scraper & Transcriber
+
+A Python-based automated system for scraping, downloading, and transcribing online course content from the Bandu API. The system converts course materials into Hugo-compatible markdown files with audio/video transcriptions.
+
+## Features
+
+- **Course Data Management**: Fetches and stores course metadata in SQLite/PostgreSQL databases
+- **Multi-threaded Downloads**: Efficiently downloads course materials (audio, video, images, text) using aria2c
+- **Audio Processing**: Automatically combines multiple audio segments into single MP3 files using FFmpeg
+- **Speech-to-Text**: Transcribes audio/video content using FunASR/SenseVoice models
+- **Hugo Integration**: Generates markdown files with proper frontmatter for Hugo static sites
+- **Smart Caching**: Stores transcriptions in database to avoid redundant processing
+
+## Prerequisites
+
+### System Dependencies
+- Python 3.12+
+- FFmpeg
+- aria2c
+
+### Python Dependencies
+See [requirements.txt](requirements.txt) for the full list. Key packages include:
+- requests
+- gradio_client
+- funasr
+- librosa
+- moviepy
+- pymongo
+- psycopg2-binary
+
+## Installation
+
+1. Clone the repository:
+```bash
+git clone <repository-url>
+cd songyi
+```
+
+2. Create and activate a virtual environment:
+```bash
+python -m venv .venv
+source .venv/bin/activate  # On Windows: .venv\Scripts\activate
+```
+
+3. Install Python dependencies:
+```bash
+pip install -r requirements.txt
+```
+
+4. Install system dependencies:
+```bash
+# Ubuntu/Debian
+sudo apt-get install ffmpeg aria2
+
+# macOS
+brew install ffmpeg aria2
+```
+
+5. Create configuration file:
+```bash
+cp config.ini.example config.ini
+# Edit config.ini with your settings
+```
+
+## Configuration
+
+Create a `config.ini` file with the following structure:
+
+```ini
+[DEFAULT]
+authorization_token = your_bearer_token_here
+limit = 100
+offset = 0
+sort = newest-first
+max_download_threads = 5
+max_retry_attempts = 3
+download_id = 1
+
+[POSTGRES]
+dbname = your_db_name
+user = your_username
+password = your_password
+host = localhost
+port = 5432
+```
+
+## Usage
+
+### Run Complete Pipeline
+
+Execute the entire workflow (fetch courses, download content, generate markdown):
+
+```bash
+python main.py
+```
+
+### Individual Components
+
+**Fetch course list only:**
+```bash
+python course_list_info_parser.py
+```
+
+**Download course content only:**
+```bash
+python course_content_parser.py
+```
+
+**Generate markdown files only:**
+```bash
+python markdown_transcribe_hugo.py
+```
+
+## Project Structure
+
+```
+songyi/
+├── main.py                          # Main orchestration script
+├── course_list_info_parser.py       # Fetches course metadata
+├── course_content_parser.py         # Downloads course materials
+├── markdown_transcribe_hugo.py      # Generates Hugo markdown
+├── transcribe_media.py              # Audio/video transcription
+├── headers.py                       # HTTP headers configuration
+├── logging_config.py                # Logging setup
+├── config.ini                       # Configuration file (not in repo)
+├── courses.db                       # SQLite database
+├── content/                         # Generated Hugo markdown files
+├── course/                          # Downloaded course materials
+│   └── {course_id}/
+│       ├── mp3/                     # Audio files
+│       ├── mp4/                     # Video files
+│       └── ...
+└── json/                            # API response cache
+    └── {course_id}.json
+```
+
+## Workflow
+
+1. **Fetch Courses**: Retrieves course list from API and stores in database
+2. **Download Content**: Downloads all course materials (audio, video, images, text)
+3. **Process Audio**: Combines audio segments and transcribes them
+4. **Generate Markdown**: Creates Hugo-compatible markdown files with:
+   - Frontmatter (date, draft, title)
+   - Text content
+   - Images with URLs
+   - Audio transcriptions
+
+## Database Schema
+
+### courses
+- `id` (INTEGER PRIMARY KEY)
+- `title` (TEXT)
+- `description` (TEXT)
+
+### contents
+- `id` (INTEGER PRIMARY KEY)
+- `course_id` (INTEGER)
+- `content` (TEXT)
+- `category` (TEXT)
+- `audio_order` (INTEGER)
+- `attachment_url` (TEXT)
+- `mime_type` (TEXT)
+
+### audio_transcriptions
+- `id` (INTEGER PRIMARY KEY AUTOINCREMENT)
+- `course_id` (INTEGER)
+- `filename` (TEXT)
+- `text` (TEXT)
+- `UNIQUE(course_id, filename)`
+
+## Features in Detail
+
+### Multi-threaded Downloads
+Uses thread pools to download multiple files concurrently with configurable retry logic.
+
+### Audio Merging
+Automatically detects multiple audio segments and merges them in order using FFmpeg.
+
+### Transcription Caching
+Stores transcription results in the database to avoid re-processing the same audio files.
+
+### Hugo Output Format
+Generates markdown files with proper Hugo frontmatter:
+```markdown
++++
+date = '2025-10-08'
+draft = false
+title = 'Course Title'
++++
+
+Course content here...
+```
+
+## Error Handling
+
+- Automatic retry for failed downloads (configurable)
+- Skips existing files to avoid redundant downloads
+- Logs all operations for debugging
+- Graceful handling of missing or corrupted files
+
+## Logging
+
+Logging is configured in [logging_config.py](logging_config.py). Check the console output for progress and error messages.
+
+## Contributing
+
+This is a personal project for archiving online course content. Feel free to fork and adapt for your own needs.
+
+## License
+
+[Add your license here]
+
+## Notes
+
+- Ensure you have proper authorization to download and process the course content
+- The system is designed for the Bandu API structure; modifications are needed for other sources
+- Transcription quality depends on the FunASR/SenseVoice model configuration
+- Large courses may require significant disk space and processing time
diff --git a/courses.db b/courses.db
index b5d99f850050b58eb642ae67761065591f8e6f7f..23763321e86914248100d90ec8a7d231e3af9f4d 100755
GIT binary patch
delta 670
zcmZY7Nl#O87{%fHr!7=%xut^2P+AM9#fr27rBEj42}J9R3MwE7Vr8BdB+$^1l4vA0
z+?ZJ6t|1sh1fnLgc46QXuxVOYk>D5L$C|}+^5$&L<{b;~sK!=zRPQr2E=fxI`@vGh
zQ81PIy*Vsvjq;4xYd`7?Rv|B^thxhYadX8Z7XKUPbcvFb@Xy~iu9znkvp@n=P(y=6
zBteS<IEX{gL62m}Fdzj+97ZbAa0Ewj4C%-~CbD2cHuknHI1Vdvkc&K=z)75f4f!|?
zJI<g0XHf_ToH&Of6r%(#l){ZNl*5AxRH6#isKI$$z(ribWn95kT*GzTz)jSm4!2N`
z+qeTS?&2Qq;{hJx5gwxfPw*7a(1<2H#|ykfGg{D!S7<{!UZVq@c!Msyg%9u0jUM!(
z5B(UxAcioEk!8g^+UomcDN}2N9Fh16L-nlqQE$sp?T=BR_xjt_lDEZK=qz?PLJRA5
uQ)pw&uFZ|l&qsd-W1qil|5i5t%tlutvCvZV*VnDdU|gAq2bN;rrgs6Ocl~St

delta 570
zcmWm7Ni%{00D$4|D=ml?yHNH$6|zOy_npX2$xillnr0kanGR;YQ+|T!kfW=M-{G7;
z;B|Or-rF1gOvuC0g#6j4a0r6x>&5IMEFtf^KdM#hwUQ_ZvhRpSS0XB2ZC15CZgrbI
zV(jO`DaL*cZ<<KRr+k!5Pm)Q5400qv0VPyWBN-`3g$8LzhZZ{MVL%2lk%esJAQySa
zhY<xRL=pbqW)!0Yr6@x=Do}|kRHFv9s6#z0Xn++q*wKh4G{b=ww4x2|=ztTQ=z<Gw
zbfX8o=tDmSFo+=xV+5lZ!#E}|i78BD2D6yMJUm#yBD`3_GFGsPHLPO;o7jR6erzLv
m9qeKc`#8WMj&O_<1aXQpoZ|wQxWYAVaEm+K$0gH4AovHTjL{|l

diff --git a/json/745.json b/json/745.json
new file mode 100644
index 0000000..6327443
--- /dev/null
+++ b/json/745.json
@@ -0,0 +1,32 @@
+{
+    "ts": 1764605503269,
+    "data": [
+        {
+            "id": 14900,
+            "course_id": 745,
+            "content": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+            "category": "video",
+            "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93",
+            "order": 0,
+            "duration": 4308320,
+            "created_at": "2025-11-30T13:01:32.073Z",
+            "updated_at": "2025-11-30T13:03:19.25Z",
+            "attachment": {
+                "id": 102535,
+                "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93",
+                "name": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "thumb": "",
+                "raw": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "size": 0,
+                "duration": 4308320,
+                "mime_type": "application/x-mpegurl",
+                "location": "qiniu",
+                "created_at": "2025-11-30T13:01:32.07Z",
+                "updated_at": "2025-11-30T13:03:19.246Z",
+                "url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "raw_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8",
+                "thumb_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8"
+            }
+        }
+    ]
+}
\ No newline at end of file
diff --git a/transcribe_media.py b/transcribe_media.py
index f632dff..e770e05 100644
--- a/transcribe_media.py
+++ b/transcribe_media.py
@@ -31,7 +31,7 @@ def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav")
     return output_audio_path
 
 
-def transcribe_audio_funasr(audio_path, device="cuda:0"):
+def transcribe_audio_funasr(audio_path, device="cpu"):
     logger.info("🧠 Loading FunASR model...")
     model = AutoModel(
         model="iic/SenseVoiceSmall",
@@ -59,7 +59,7 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"):
 
 
 # 加载模型并作为全局变量
-default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True)
+default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cpu", disable_update=True)
 
 def transcribe_audio_funasr_batch(audio_path):
     res = default_model.generate(