diff --git a/.gitignore b/.gitignore index 4e07bc9..65b999c 100644 --- a/.gitignore +++ b/.gitignore @@ -30,7 +30,8 @@ *.tar.bz2 *.tgz -*.md +/markdown/*.md +/content/*.md # 其他格式的媒体文件 /.venv/ diff --git a/README.md b/README.md new file mode 100644 index 0000000..130dd22 --- /dev/null +++ b/README.md @@ -0,0 +1,218 @@ +# Songyi Course Content Scraper & Transcriber + +A Python-based automated system for scraping, downloading, and transcribing online course content from the Bandu API. The system converts course materials into Hugo-compatible markdown files with audio/video transcriptions. + +## Features + +- **Course Data Management**: Fetches and stores course metadata in SQLite/PostgreSQL databases +- **Multi-threaded Downloads**: Efficiently downloads course materials (audio, video, images, text) using aria2c +- **Audio Processing**: Automatically combines multiple audio segments into single MP3 files using FFmpeg +- **Speech-to-Text**: Transcribes audio/video content using FunASR/SenseVoice models +- **Hugo Integration**: Generates markdown files with proper frontmatter for Hugo static sites +- **Smart Caching**: Stores transcriptions in database to avoid redundant processing + +## Prerequisites + +### System Dependencies +- Python 3.12+ +- FFmpeg +- aria2c + +### Python Dependencies +See [requirements.txt](requirements.txt) for the full list. Key packages include: +- requests +- gradio_client +- funasr +- librosa +- moviepy +- pymongo +- psycopg2-binary + +## Installation + +1. Clone the repository: +```bash +git clone [repository-url] +cd songyi +``` + +2. Create and activate virtual environment: +```bash +python -m venv .venv +source .venv/bin/activate # On Windows: .venv\Scripts\activate +``` + +3. Install Python dependencies: +```bash +pip install -r requirements.txt +``` + +4. Install system dependencies: +```bash +# Ubuntu/Debian +sudo apt-get install ffmpeg aria2 + +# macOS +brew install ffmpeg aria2 +``` + +5. 
Create configuration file: +```bash +cp config.ini.example config.ini +# Edit config.ini with your settings +``` + +## Configuration + +Create a `config.ini` file with the following structure: + +```ini +[DEFAULT] +authorization_token = your_bearer_token_here +limit = 100 +offset = 0 +sort = newest-first +max_download_threads = 5 +max_retry_attempts = 3 +download_id = 1 + +[POSTGRES] +dbname = your_db_name +user = your_username +password = your_password +host = localhost +port = 5432 +``` + +## Usage + +### Run Complete Pipeline + +Execute the entire workflow (fetch courses, download content, generate markdown): + +```bash +python main.py +``` + +### Individual Components + +**Fetch course list only:** +```bash +python course_list_info_parser.py +``` + +**Download course content only:** +```bash +python course_content_parser.py +``` + +**Generate markdown files only:** +```bash +python markdown_transcribe_hugo.py +``` + +## Project Structure + +``` +songyi/ +├── main.py # Main orchestration script +├── course_list_info_parser.py # Fetches course metadata +├── course_content_parser.py # Downloads course materials +├── markdown_transcribe_hugo.py # Generates Hugo markdown +├── transcribe_media.py # Audio/video transcription +├── headers.py # HTTP headers configuration +├── logging_config.py # Logging setup +├── config.ini # Configuration file (not in repo) +├── courses.db # SQLite database +├── content/ # Generated Hugo markdown files +├── course/ # Downloaded course materials +│ └── {course_id}/ +│ ├── mp3/ # Audio files +│ ├── mp4/ # Video files +│ └── ... +└── json/ # API response cache + └── {course_id}.json +``` + +## Workflow + +1. **Fetch Courses**: Retrieves course list from API and stores in database +2. **Download Content**: Downloads all course materials (audio, video, images, text) +3. **Process Audio**: Combines audio segments and transcribes them +4. 
**Generate Markdown**: Creates Hugo-compatible markdown files with: + - Frontmatter (date, title) + - Text content + - Images with URLs + - Audio transcriptions + +## Database Schema + +### courses +- `id` (INTEGER PRIMARY KEY) +- `title` (TEXT) +- `description` (TEXT) + +### contents +- `id` (INTEGER PRIMARY KEY) +- `course_id` (INTEGER) +- `content` (TEXT) +- `category` (TEXT) +- `audio_order` (INTEGER) +- `attachment_url` (TEXT) +- `mime_type` (TEXT) + +### audio_transcriptions +- `id` (INTEGER PRIMARY KEY AUTOINCREMENT) +- `course_id` (INTEGER) +- `filename` (TEXT) +- `text` (TEXT) +- `UNIQUE(course_id, filename)` + +## Features in Detail + +### Multi-threaded Downloads +Uses thread pools to download multiple files concurrently with configurable retry logic. + +### Audio Merging +Automatically detects multiple audio segments and merges them in order using FFmpeg. + +### Transcription Caching +Stores transcription results in the database to avoid re-processing the same audio files. + +### Hugo Output Format +Generates markdown files with proper Hugo frontmatter: +```markdown ++++ +date = '2025-10-08' +draft = false +title = 'Course Title' ++++ + +Course content here... +``` + +## Error Handling + +- Automatic retry for failed downloads (configurable) +- Skips existing files to avoid redundant downloads +- Logs all operations for debugging +- Graceful handling of missing or corrupted files + +## Logging + +Logs are configured through [logging_config.py](logging_config.py). Check console output for progress and error messages. + +## Contributing + +This is a personal project for archiving online course content. Feel free to fork and adapt for your own needs. 
+ +## License + +[Add your license here] + +## Notes + +- Ensure you have proper authorization to download and process the course content +- The system is designed for the Bandu API structure; modifications needed for other sources +- Transcription quality depends on the FunASR/SenseVoice model configuration +- Large courses may require significant disk space and processing time diff --git a/courses.db b/courses.db index b5d99f8..2376332 100755 Binary files a/courses.db and b/courses.db differ diff --git a/json/745.json b/json/745.json new file mode 100644 index 0000000..6327443 --- /dev/null +++ b/json/745.json @@ -0,0 +1,32 @@ +{ + "ts": 1764605503269, + "data": [ + { + "id": 14900, + "course_id": 745, + "content": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8", + "category": "video", + "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93", + "order": 0, + "duration": 4308320, + "created_at": "2025-11-30T13:01:32.073Z", + "updated_at": "2025-11-30T13:03:19.25Z", + "attachment": { + "id": 102535, + "attachment_id": "7d05ce95-2467-4744-b317-8eac65568b93", + "name": "7d05ce95-2467-4744-b317-8eac65568b93.m3u8", + "thumb": "", + "raw": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8", + "size": 0, + "duration": 4308320, + "mime_type": "application/x-mpegurl", + "location": "qiniu", + "created_at": "2025-11-30T13:01:32.07Z", + "updated_at": "2025-11-30T13:03:19.246Z", + "url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8", + "raw_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8", + "thumb_url": "https://pili-vod.songy.info/7d05ce95-2467-4744-b317-8eac65568b93.m3u8" + } + } + ] +} \ No newline at end of file diff --git a/transcribe_media.py b/transcribe_media.py index f632dff..e770e05 100644 --- a/transcribe_media.py +++ b/transcribe_media.py @@ -31,7 +31,7 @@ def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav") return output_audio_path -def 
transcribe_audio_funasr(audio_path, device="cuda:0"): +def transcribe_audio_funasr(audio_path, device="cpu"): logger.info("🧠 Loading FunASR model...") model = AutoModel( model="iic/SenseVoiceSmall", @@ -59,7 +59,7 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"): # 加载模型并作为全局变量 -default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True) +default_model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cpu", disable_update=True) def transcribe_audio_funasr_batch(audio_path): res = default_model.generate(