diff --git a/course_content_parser.py b/course_content_parser.py index a51a44a..0b44510 100755 --- a/course_content_parser.py +++ b/course_content_parser.py @@ -177,6 +177,7 @@ def download_course_contents(course_ids, course_ids_dict): shutil.move(combined_audio_filename, course_audio_filename) os.remove(text_file) + # 删除音频文件 for item in audio_files: audio_file_path = os.path.join(course_id_folder, item['attachment']['name']) try: diff --git a/courses.db b/courses.db index 72403bd..709b556 100755 Binary files a/courses.db and b/courses.db differ diff --git a/logging_config.py b/logging_config.py index 731f855..7597abc 100644 --- a/logging_config.py +++ b/logging_config.py @@ -1,18 +1,34 @@ import logging +import sys +import colorlog def setup_logging(): logger = logging.getLogger(__name__) logger.setLevel(logging.INFO) - # Create a console handler - console_handler = logging.StreamHandler() - console_handler.setLevel(logging.INFO) + # 检查是否已经存在处理器 + if not logger.hasHandlers(): + # Create a console handler + console_handler = colorlog.StreamHandler(sys.stdout) + console_handler.setLevel(logging.INFO) - # Create a formatter and add it to the handler - formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') - console_handler.setFormatter(formatter) + # Create a formatter and add it to the handler + formatter = colorlog.ColoredFormatter( + '%(asctime)s - %(name)s - %(log_color)s%(levelname)s%(reset)s - %(message)s', + log_colors={ + 'DEBUG': 'cyan', + 'INFO': 'green', + 'WARNING': 'yellow', + 'ERROR': 'red', + 'CRITICAL': 'bold_red', + } + ) + console_handler.setFormatter(formatter) - # Add the handler to the logger - logger.addHandler(console_handler) + # Add the handler to the logger + logger.addHandler(console_handler) - return logger \ No newline at end of file + return logger + +if __name__ == "__main__": + logger = setup_logging() diff --git a/markdown_transcribe.py b/markdown_transcribe.py index 3e9637b..336f808 100644 --- a/markdown_transcribe.py +++ b/markdown_transcribe.py @@ -9,13 +9,15 @@ from os import makedirs import requests import json +from course_content_parser import max_download_threads from logging_config import setup_logging from transcribe_media import convert_media # 读取配置文件 config = configparser.ConfigParser() config.read('config.ini') -max_download_threads = int(config['DEFAULT']['max_download_threads']) +# max_download_threads = int(config['DEFAULT']['max_download_threads']) +max_download_threads = 1 logger = setup_logging() @@ -59,6 +61,7 @@ db_path = 'courses.db' # 数据库文件路径 # 下载音频文件 def download_file(url, local_path): + logger.info("download voice file: " + url + " to " + local_path) try: with requests.get(url, stream=True) as r: r.raise_for_status() @@ -75,7 +78,7 @@ def download_file(url, local_path): # 调用api将语音转换为文本 def voice2txt(voice_path): - text = convert_media(voice_path) + text = convert_media(voice_path, True, False) return text @@ -180,7 +183,6 @@ def get_content(): # 查询courses表中的所有课程ID cursor.execute('SELECT id, title FROM courses where id >= ?', (start_course_id,)) - # cursor.execute('SELECT id, title FROM courses where id >= 609') course_ids_data = cursor.fetchall() course_ids = [row[0] for row in course_ids_data] course_ids_dict = dict(course_ids_data) @@ -195,10 +197,8 @@ def get_content(): logger.info(f"Processing course ID: {course_id}") json_filename = os.path.join('json', f'{course_id}.json') - # copy_json_file_name = os.path.join('data', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?') copy_json_file_name = os.path.join('course', f'{course_id}', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?') - # md_file_name = os.path.join('data', 'markdown', f'{course_ids_dict[course_id]}.md') md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}.md') if os.path.exists(json_filename): logger.info(f"Course {course_id} JSON file already exists, using local file.") @@ -218,5 +218,4 @@ def get_content(): if __name__ == '__main__': - # create_audio_transcriptions_table(db_path) get_content() diff --git a/pyproject.toml b/pyproject.toml index 92b2f82..e20d07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.1.0" description = "Add your description here" requires-python = ">=3.12" dependencies = [ + "colorlog>=6.9.0", "fastapi>=0.111.1", "funasr>=1.1.3", "gradio", diff --git a/requirements.txt b/requirements.txt index c4a8d1a..97612ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,9 @@ funasr>=1.1.3 numpy<=1.26.4 gradio fastapi>=0.111.1 -pymongo~=4.12.0 \ No newline at end of file +pymongo~=4.12.0 +librosa~=0.11.0 +PyYAML~=6.0.2 +jieba~=0.42.1 +colorlog~=6.9.0 +moviepy~=2.1.2 \ No newline at end of file diff --git a/transcribe_media.py b/transcribe_media.py index 1c733fa..f577a1e 100644 --- a/transcribe_media.py +++ b/transcribe_media.py @@ -35,7 +35,8 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"): remote_code="./model.py", # Make sure this file is accessible vad_model="fsmn-vad", vad_kwargs={"max_single_segment_time": 30000}, - device=device + device=device, + disable_update=True ) logger.info("📤 Transcribing with FunASR...") @@ -52,26 +53,45 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"): text = rich_transcription_postprocess(res[0]["text"]) return text +def transcribe_audio_funasr_batch(audio_path): + model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True) -def convert_media(file_path): + res = model.generate( + input=audio_path, + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size=64, + ) + + text = rich_transcription_postprocess(res[0]["text"]) + return text + + +def convert_media(file_path, is_batch=False, save_to_disk=True): try: audio_file = extract_or_convert_audio(file_path) - transcript = transcribe_audio_funasr(audio_file) + if is_batch: + transcript = transcribe_audio_funasr_batch(audio_file) + else: + transcript = transcribe_audio_funasr(audio_file) logger.info("\n📜 Transcript:") logger.info(transcript) # ✅ Save transcript to disk - output_path = os.path.splitext(file_path)[0] + "_transcript.md" - with open(output_path, "w", encoding="utf-8") as f: - f.write(transcript) - - logger.info(f"✅ Transcript saved to: {output_path}") + if save_to_disk: + output_path = os.path.splitext(file_path)[0] + ".md" + with open(output_path, "w", encoding="utf-8") as f: + f.write(transcript) + logger.info(f"✅ Transcript saved to: {output_path}") return transcript finally: if os.path.exists("processed_audio.wav"): os.remove("processed_audio.wav") + + def main(): audio_files = [] for root, dirs, files in os.walk('media'): diff --git a/uv.lock b/uv.lock index 36afbdb..f2e3562 100644 --- a/uv.lock +++ b/uv.lock @@ -213,6 +213,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "colorlog" +version = "6.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d3/7a/359f4d5df2353f26172b3cc39ea32daa39af8de522205f512f458923e677/colorlog-6.9.0.tar.gz", hash = "sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2", size = 16624 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff", size = 11424 }, +] + [[package]] name = "crcmod" version = "1.7" @@ -1604,6 +1616,7 @@ name = "songyi" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "colorlog" }, { name = "fastapi" }, { name = "funasr" }, { name = "gradio" }, @@ -1623,6 +1636,7 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "colorlog", specifier = ">=6.9.0" }, { name = "fastapi", specifier = ">=0.111.1" }, { name = "funasr", specifier = ">=1.1.3" }, { name = "gradio" },