Files
songyi/transcribe_media.py
2025-04-15 09:28:39 +08:00

71 lines
2.2 KiB
Python

import os
import sys
import tempfile

from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess
from moviepy.video.io.VideoFileClip import VideoFileClip
from pydub import AudioSegment
def extract_or_convert_audio(file_path, output_audio_path="processed_audio.wav"):
    """Produce a WAV file from a video or audio input.

    Files with a video extension (.mp4, .mov, .avi, .mkv) have their audio
    track written out via moviepy; files with an audio extension (.mp3,
    .wav, .flac, .m4a, .aac) are re-exported as WAV via pydub. The
    extension check is case-insensitive.

    Args:
        file_path: Path to the source media file.
        output_audio_path: Destination path of the WAV file.

    Returns:
        ``output_audio_path``, unchanged.

    Raises:
        ValueError: If the file extension is not supported, or the video
            contains no audio track.
    """
    ext = os.path.splitext(file_path)[1].lower()
    if ext in (".mp4", ".mov", ".avi", ".mkv"):
        print("🎬 Extracting audio from video...")
        video = VideoFileClip(file_path)
        try:
            # moviepy leaves ``audio`` as None when the container has no
            # audio stream; fail with a clear message instead of an
            # AttributeError on None.
            if video.audio is None:
                raise ValueError(f"No audio track found in video: {file_path}")
            video.audio.write_audiofile(output_audio_path)
        finally:
            # Always release the clip's reader and the open file handle,
            # even when extraction fails.
            video.close()
    elif ext in (".mp3", ".wav", ".flac", ".m4a", ".aac"):
        print("🎧 Converting audio format...")
        sound = AudioSegment.from_file(file_path)
        sound.export(output_audio_path, format="wav")
    else:
        raise ValueError("Unsupported file type.")
    return output_audio_path
def transcribe_audio_funasr(audio_path, device="cuda:0"):
    """Transcribe an audio file with FunASR's SenseVoiceSmall model.

    A new ``AutoModel`` (SenseVoiceSmall plus the ``fsmn-vad`` VAD model)
    is constructed on every call, then run over ``audio_path`` with
    automatic language selection.

    Args:
        audio_path: Path to the audio file passed to the model as input.
        device: Device string handed to ``AutoModel`` (default ``"cuda:0"``).

    Returns:
        The text of the first result entry after
        ``rich_transcription_postprocess``.
    """
    print("🧠 Loading FunASR model...")
    model_options = {
        "model": "iic/SenseVoiceSmall",
        "trust_remote_code": True,
        "remote_code": "./model.py",  # Make sure this file is accessible
        "vad_model": "fsmn-vad",
        # NOTE(review): presumably milliseconds (30 s per segment) — confirm
        "vad_kwargs": {"max_single_segment_time": 30000},
        "device": device,
    }
    asr_model = AutoModel(**model_options)

    print("📤 Transcribing with FunASR...")
    generate_options = {
        "input": audio_path,
        "cache": {},
        "language": "auto",
        "use_itn": True,
        "batch_size_s": 60,
        "merge_vad": True,
        "merge_length_s": 15,
    }
    results = asr_model.generate(**generate_options)

    # Only the first result entry is used.
    raw_text = results[0]["text"]
    return rich_transcription_postprocess(raw_text)
def convert_media(file_path):
    """Transcribe a media file and save the transcript next to it.

    The input is converted to an intermediate WAV file, transcribed,
    printed to stdout, and written as UTF-8 to
    ``<file_path without extension>_transcript.md``.

    The intermediate WAV is created inside a private temporary directory
    rather than at a fixed name in the working directory, so an existing
    ``processed_audio.wav`` is never overwritten or deleted and concurrent
    runs cannot collide. The directory is removed when the function exits,
    whether it succeeds or raises.

    Args:
        file_path: Path to the video or audio file to transcribe.

    Returns:
        The transcript text.

    Raises:
        ValueError: Propagated from ``extract_or_convert_audio`` when the
            file type is not supported.
    """
    with tempfile.TemporaryDirectory(prefix="transcribe_media_") as tmp_dir:
        temp_audio_path = os.path.join(tmp_dir, "processed_audio.wav")
        audio_file = extract_or_convert_audio(file_path, temp_audio_path)
        transcript = transcribe_audio_funasr(audio_file)

        print("\n📜 Transcript:")
        print(transcript)

        # ✅ Save transcript to disk (before temp cleanup, so a cleanup
        # failure can never cost us the finished transcript).
        output_path = os.path.splitext(file_path)[0] + "_transcript.md"
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(transcript)
        print(f"✅ Transcript saved to: {output_path}")

    return transcript
if __name__ == '__main__':
    # Transcribe every path given on the command line. With no arguments,
    # fall back to the sample recording that was previously hard-coded.
    default_media = "./course/676/mp4/20250413142836-第36期茶话会-视频-1.mp4"
    for media_path in sys.argv[1:] or [default_media]:
        convert_media(media_path)