From a2267b7c7415b24ea62b2b7aaf8193cdf442419c Mon Sep 17 00:00:00 2001 From: lostecho Date: Sun, 20 Apr 2025 08:44:06 +0800 Subject: [PATCH] fix log issue --- course_content_parser.py | 1 + courses.db | Bin 2760704 -> 2777088 bytes logging_config.py | 34 +++++++++++++++++++++++++--------- markdown_transcribe.py | 11 +++++------ pyproject.toml | 1 + requirements.txt | 7 ++++++- transcribe_media.py | 36 ++++++++++++++++++++++++++++-------- uv.lock | 14 ++++++++++++++ 8 files changed, 80 insertions(+), 24 deletions(-) diff --git a/course_content_parser.py b/course_content_parser.py index a51a44a..0b44510 100755 --- a/course_content_parser.py +++ b/course_content_parser.py @@ -177,6 +177,7 @@ def download_course_contents(course_ids, course_ids_dict): shutil.move(combined_audio_filename, course_audio_filename) os.remove(text_file) + # 删除音频文件 for item in audio_files: audio_file_path = os.path.join(course_id_folder, item['attachment']['name']) try: diff --git a/courses.db b/courses.db index 72403bde2245aec5732110cba152a57b82193115..709b556a8ecb5155dc6d2cec00ce0dc31070de85 100755 GIT binary patch delta 15844 zcmeHOX>1(lc_vr1BwCaxJC?2ZB4z8eY?-94sI8kgsO_L_8nx+>1UXhsjQB`Wr)hx( zNIFYWvpYPNOPa^<)RNNj5UJfI#bYm#l!Ky8i(cuE{_u~$Xy==m1%lM|kF@!bw9os! zGqXdkq)4vaq(BXNMb3Q3dp_^+eH*$yQyIE`xw8AAo!_sh2u$Js$%=}K%AUigE5F|y zRD+djFrWsvsKF{Vc$XU7ss^{I!R>0WS`FT<26w2z8a2364enBdwQ6v;8oWmh-m3;b zpa$<#gL~BA{c7+5HMmy|eozfQs0JTWgAc30N0x(+CjM)Cq{hTnfy8o6-S*1uHSg5? zpvJ6uEAwz=-Cyi@lwN4{M3QzKpO>^J@p(}@h|h^gF!R%zy06{4|ICSFUw!TPmOsS9 z-~&~Gqk%xzxj+5V%YXER#->xJo9JEOuQJ_rbzj>Zs5%{Ps0%mLha2{V8y*WcJRWY? zztV7^>Uz~c<=YkdvF`dq`qOV8_z(SHPkpU^^4|}c_h4Pw2lse(23dYd=-{k=nzxyLbKPuD5o5H!v6o1zrlA4SXn2AJ|^`wtiyb z(06q6=#jt4T)%i|SB*aP6hSyNT>8#Qyg zKDTSR_MX~rR9~+CLG|<1+TyeG)7yK`&VPIRht-33RWw)b)USW#k^5daT`xxSA~9eu z4;Nowuv=F}`-nN1Hb+AuF>MW>H`d3B!=sO?hpH-`-PzR8bn;lkiDQkYk6EdZi1v%o zd683{ zm4CLp?TMe%?!CKmprZDbUB9|>V8@#~o~iz=ZLzAq+;XmR0G9cg%Cq6$(e`ew2po-W z@slKZHrql*q4fprAiWQN)&E|SW?i{rx*fB4V!A5?q~#QXy6 zTCDbpwkYh{ULV2?Be!5LOo;yWf4kOs<5%j6NL?_;b7sEV9GkQwvqrwlUYZl>PLWC3 znK7PwlK~2i_}5TNvE`DH)2yy}F*;+8wi($(iP~hJIiZPO4HOvJ5Iyg?2tuvCxRIYW z`!OqGWIKvOQ6nG57%7ZNjIk1H#G7Xs;_n=6Hs-kz|^E>sE4r$V^O| zQ!Q*UXBt3>$SuK>S(M(g5S)nQK~MP_My`Xmro~`O*ZM`Q2R;DT6R|n0gB|SdX;T@S zkWz7E-*~3!47O@KbG!ln5M4ZM<0$eL0u%^w0n~xC!4X{G@pDxAt&%K11qpAK#(HeXQw9}sgsw){U(HAvjr^PoJW;D}#wrYfIR$h%evlf5e!+$v>cm?7+I!1hiG}Rx*F}B`!w4)Qvm&!V zKHIv8A4fZdUa}ME5Lkzy31o>RIV@Oh{pHA)CAR-wVDen1Y3~z%zpejYo*1}qCo+Yt zfn>$5`kMQze{=hhyS@`hB3cXul<+;;lQg5VvSj~F4Ki|y$yG)+_I9$hsTT9XD0vxF zE#)aPS7D*nXot||c`zFm0ESt|T#5^kJbodm>MMvXV9^2ZSAo(1nWga(Q>&wZ3MT-$ z@JANlLF8B~z!4}}oExO)QX0T0m@XVdtm)Xy)MoLgHIjulVpmJbO>cgsZqH|mqalEy zCt(K?9ngvr(j*3oV;8}pk81@uCAzKebObxof!mbK1v(^%1^RYQS`N+P#3OV3F zj#hxY`rFBL+WYP0^B{vX>w8O<&=)tC3jE4mACy8VO~(KjiO46-{vo^NDu#g!GgN>T z*)xbr9lXB19)YS%eG+CM6c_3!Qc}#BH&$by7?bVv7#k^*?w^LY=6DuAV!f5WhreSzyEa(Q9`e-1CCnG9j`p#nFfzoq$B)AfU!xYsEwosx*5Z1GSUA zA@sO0(i~bUj?CB-Gh$GG>strz*LT?mw(4(vw65yK?iq6+y=ikJ7q@1cDS-59u7y!% zNLQDYj!t><0fKDY$c41ej~87vWdHGyftjb&B=`8O`}e zO7PftMivSp2Xv2?c*(_`Co)rUvK(KELy6~V7XgVxQE8w~bS2N^WDSc*Um|Of!xV|6 z9grGl*rtnqEK1G}u@>*=&rQiw6Vup_kRV5R?Ld~y>N!thU2{@Fis8r-nub{;Stov! zY!o=lxML8gw|Gu^1ZXjMd{6`F2rH-E2G*Of<=Sij~46Apn0$Dm4;U%)vDZC9=5SX(C0* z8#c3tRw?MPS8|rx-ANHH)`NvS`-YX!#F25tPRhj=3w{&h)YzfGXD-;ni~-N15n8Cw z1u+2DoV5UbmoorEu#Q*=HPeb5!`zXyWx5k%0Bl$YnkwIeR))pJqJ*@l${F72|+4d%YW<{$z=~Tf0oJ)>>%~$WWG3eo69pipJ?2ias7d#lUJWmt`Ex zH_6{@i`re88U%D;hrM(W+{g&7KXa)LxxIu_UaH*c$8VKUap24j__ zJR2gdjCk&*I_2z;%_yr~pb8Yjbs;my#ZXI1@U&YHxQ7w^*t0~>#v;V0Wwt-ko*VjT zDfr3_*O3~Ql6}T0Xen5O*sidMUG;pEln9iQ-in078d-u)$#MpgG-hQ+ue0j+41A)l zO35ZEk!H6aZ9_?HcFvjYlvlCmvX*Gp#udPMX^aGK$GaKZl9xo{JPeO>K(4wmMf9L( zk3$J$b%b*Ym2Bk_M*Rx-!LC@sgH4t(M+%PANV1IVl2;OWxq5Nzg2-l(L6hUk><^v? zaz#E_-q#oqG2n114`!@# z>i_I4ty$)Uo*=o)gDE8cn|Tl^3F#wR5b?=kb1x}Rg0&QqY+U%#b`-?~d&xoQOHL%v z-!zisK`H#KpYAU?phbyl0xkoe;vi3>0hq}IWrET?o!lN}hV1#=FvG{*_@QE_YHNK( zO{V(CTfeon9)-^Ct@nRgJ57pjzNZwU&@0uymw4JjR|$Vp zJ4IuozwI9@8!eRH$g&#*N(Mthi-`F!&=c|jVtREVd$8GuDujAsY`0u4af)XY;G+Ga z{eroO0-9_G)gJvQ{7+ZGo|q^`Vvq@#0zRNj^};D545Un%XeNamhcqO|a9@is1OXP< z0FY$|m3?LmbLvf*VS3WWcvr#-L`>~7u9}!p?Bw`zZh`X1+yZE_GazW3!i4ORacpCa z;uCL$Hb+5<`yj0ogskcutm%eRuoNwr&nsTlb;EHRq$k5EZwYlULHHAEiP$WcdZKxl zDp^S)YGq$<%*nNQ0Q&UYiMNixPq~4H<_0S!oddgG!w{>pZ1uV=`}UmSYLUZ+oh}%; zDMtp)lXf0xO4Xmprc9_KhW6tu75#F|jkM{3ToLzxad=Nku9>u=Ml2AEAU)}(y zg5=#U+eCQ;X4&Oq_GC+w*p}%dlT8`mnoi}1)B~a{1aXpTiD)xES@#MPD<$Rt81`fc zWh4XZj@=QUqb3eR)TL!G8XH+N9TMo04w3K#Z{MW?ks;BG&p5E6`b*kL^09X1lYS?Y zjW=^thaAx>rKP7YiSQz94}ONFy)fyDIAW*Pi1rjAy_k=&l*tzngNR-<-D2q2`R8Ih zfUOw1h8~`wpnQ|aW~O84=TPY=wj_x6^GHLscsp8Oh5i+hs+6nN6HXc_fn}!rcZ8*u_)84Ru+wmr9vRVSrj<5E|yR zk3cY`CFBZzWClSD&Nf=D=oH9!OYQggB<0l62mvSeC8U3woy}Gu2{K5)u~{gNSVCiF z%uC!9q}cKBbJ__q+D>1yxQP$SoQerS6<~|w5G2hhgvmU?0*p~MU??|O>?e>y$&jTS z19*fjr;!^sThk6!@B!3d znz4;PMFGK!n&h;T{GA$^((eh;+-|HDKSx=R=+;Ty&Nb^XUPk3FQo7U%>a5f#_+_B_ssWcf!fG)ajEp?jA# z+=L!bp*VO@&`@gua$rR~0UUDn!I71$-|a4=iw;!Y4+IQIJU(_Hmam498XIeBB@ z|1}33xRJ~KVrVY>a^&lzxoBf4V|LCjKK`j)#r44rEQ_{cbwxoOe@4Nw%Wp0Mx3;ZU8nwg;FHe)4*mr zg_8$@I1Y1C3cwy0l>X7!gF;a_O7)baOf*wQkKgf}QoTeR=`ylhqK2E&9ue}!iIFW( zo{cCVFO)cK57Z5OvqX_sOUUUH)8&0)+QaQ|fE6cN(76&_F+lAAqW#T9knXjnC{2+W zW%)3gHGy+rRL4p$&GB{h$-LYH7XQ-V02gEEO3Y0brFAGhq}wQn29$@$QV-xw>9v$L zGk`1%!Gxo+Vy}+~brPy=YBaI~$Xh@hr)i*s^9M=NhBG$K4V-4h`Iwy!^ZFE3vN&LK zuBJ$my0~WW>lu-cULj*?Q=*_1q{u($jsZn6w=8fHLS$s4@=W&HxY@Q2`Y@TPC)+@7 z6rs~%jz{!_*><0JHtM`Wkt-82o}!T5xK`|64qYg`Dxbs8y(twHTV9qvcf{{=@@V6R z{HY8#$&`;JrcF)|jne{zdz`?fsi_8mug@2l-QDz;DDxE%jKP;I~Z{&)@@fNTqV zy`m<0_fM)nzAdwLs_I7peP8{7vx{fvYXXW|sU0WJYCBg7YE2FDQiGR6()BJ0t>_>j zccS(kUvtA1m{g9+Q9cpKkBEG~SZFEYTGCpN7q%%UlZ^@rJ2J5=b*L1XxDhP-ZvXm| zGrET%vpg@KF;mOl@5pVqg7;6SSk9TCo0jpVVsBXtplefftm|y)rnh+O4FKe{KB+a4)rC*{ ztxl#=tlQh^^(gPY*>i_212O@56>CGsK!M_YPoZ6Cjzr>n$UPnw`z?Aq@2au0`+e-} zwCvOxNkOgO^ec#OM#QF&`t&`yvA#BW#e3Wbyl465;j+ZVm!- zx}#Pb-SYobI3tjkGdH z0u=ijxh=h7;`pqQjx_07i9C%USdf3otKi;WbTUGi(e7387^UE>jM3xia`cd$;>~3- zZ5wx+1+P=nVL3zPwNw>riY(E);B%8=-{~nRH!sP-4p-28_XYCwUr5ED{x3X$Emj+~ zIyCA=`{S3RCKRs;I@bZMcw$gNtgKswY1G9Yo?}-!$;EwGGU-`d^P#nTXJf2XZ#;N4 z7iVxo0EWuP1CG@=dZkE=!JDOe8Qz43Ns!QO0d|E>W1W92H7J!0=3gS?gK zbSRJ~a3HXzd(8F_w=oQ5eZp+Uc~PmS;rRw+0M-$0VcT#amI{d-U$($;fiyoap>ggf z;pm`KH9M{_$JCjbTm^4rh9}R-Zzrwjj;yTeC*5@c{(wDm(U=$t?uh54Zks z*&X+pHeOf;i=dBb!Z2fP%w8U_v=uW=S71qTnFa39iKP*kDm`2uqKm7N4Fz-j9MfwT z9B|*3{fK$RG}4Q_JVh3cFFI&LWE}E(DaJSh-0$FkX<#$H>0u@(@ihxf34cO+id5G5 zZ7&2U_y!-J%DdmmA>@)rL-2GEcSqetGT1xoE-JTbFG{$eQ&e#IGrNoLuC=H*IN;t} zBzgMs)=jNNXon#IW^qTYMNGVLYY`cTe?QvZ=`C(AUGAK2N8+aA4;=>?XZLMgc+~G#}yZ Ovb#uK8~L>}Q2*Z(Rge(? delta 525 zcmXYtO=#0#9LAG0wrSrq{czgYsma{p+ip6u4^|NwxRbC;PvT+XzaTP2=Rn*!g8vkD z=rRP~o`$W0En&7GU9Mq=34$Kf!Ibk1una zQ0M+CFx+EO*TDj!?mbHbnT$mWWyEqXl z1|u-5XQ-Z|X$wgq^u;zc+th7yz&20W=Adm3ZJ37Wiz`BvW225v*||OAhoZEVs$wbg z{5J3N4K}%NY_hF&+9`sTV=5!xKP;OH~#;#0blByD1md9q82= ?', (start_course_id,)) - # cursor.execute('SELECT id, title FROM courses where id >= 609') course_ids_data = cursor.fetchall() course_ids = [row[0] for row in course_ids_data] course_ids_dict = dict(course_ids_data) @@ -195,10 +197,8 @@ def get_content(): logger.info(f"Processing course ID: {course_id}") json_filename = os.path.join('json', f'{course_id}.json') - # copy_json_file_name = os.path.join('data', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?') copy_json_file_name = os.path.join('course', f'{course_id}', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?') - # md_file_name = os.path.join('data', 'markdown', f'{course_ids_dict[course_id]}.md') md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}.md') if os.path.exists(json_filename): logger.info(f"Course {course_id} JSON file already exists, using local file.") @@ -218,5 +218,4 @@ def get_content(): if __name__ == '__main__': - # create_audio_transcriptions_table(db_path) get_content() diff --git a/pyproject.toml b/pyproject.toml index 92b2f82..e20d07d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,6 +4,7 @@ version = "0.1.0" description = "Add your description here" requires-python = ">=3.12" dependencies = [ + "colorlog>=6.9.0", "fastapi>=0.111.1", "funasr>=1.1.3", "gradio", diff --git a/requirements.txt b/requirements.txt index c4a8d1a..97612ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,9 @@ funasr>=1.1.3 numpy<=1.26.4 gradio fastapi>=0.111.1 -pymongo~=4.12.0 \ No newline at end of file +pymongo~=4.12.0 +librosa~=0.11.0 +PyYAML~=6.0.2 +jieba~=0.42.1 +colorlog~=6.9.0 +moviepy~=2.1.2 \ No newline at end of file diff --git a/transcribe_media.py b/transcribe_media.py index 1c733fa..f577a1e 100644 --- a/transcribe_media.py +++ b/transcribe_media.py @@ -35,7 +35,8 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"): remote_code="./model.py", # Make sure this file is accessible vad_model="fsmn-vad", vad_kwargs={"max_single_segment_time": 30000}, - device=device + device=device, + disable_update=True ) logger.info("📤 Transcribing with FunASR...") @@ -52,26 +53,45 @@ def transcribe_audio_funasr(audio_path, device="cuda:0"): text = rich_transcription_postprocess(res[0]["text"]) return text +def transcribe_audio_funasr_batch(audio_path): + model = AutoModel(model="iic/SenseVoiceSmall", trust_remote_code=True, device="cuda:0", disable_update=True) -def convert_media(file_path): + res = model.generate( + input=audio_path, + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size=64, + ) + + text = rich_transcription_postprocess(res[0]["text"]) + return text + + +def convert_media(file_path, is_batch=False, save_to_disk=True): try: audio_file = extract_or_convert_audio(file_path) - transcript = transcribe_audio_funasr(audio_file) + if is_batch: + transcript = transcribe_audio_funasr_batch(audio_file) + else: + transcript = transcribe_audio_funasr(audio_file) logger.info("\n📜 Transcript:") logger.info(transcript) # ✅ Save transcript to disk - output_path = os.path.splitext(file_path)[0] + "_transcript.md" - with open(output_path, "w", encoding="utf-8") as f: - f.write(transcript) - - logger.info(f"✅ Transcript saved to: {output_path}") + if save_to_disk: + output_path = os.path.splitext(file_path)[0] + ".md" + with open(output_path, "w", encoding="utf-8") as f: + f.write(transcript) + logger.info(f"✅ Transcript saved to: {output_path}") return transcript finally: if os.path.exists("processed_audio.wav"): os.remove("processed_audio.wav") + + def main(): audio_files = [] for root, dirs, files in os.walk('media'): diff --git a/uv.lock b/uv.lock index 36afbdb..f2e3562 100644 --- a/uv.lock +++ b/uv.lock @@ -213,6 +213,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335 }, ] +[[package]] +name = "colorlog" +version = "6.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "colorama", marker = "sys_platform == 'win32'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d3/7a/359f4d5df2353f26172b3cc39ea32daa39af8de522205f512f458923e677/colorlog-6.9.0.tar.gz", hash = "sha256:bfba54a1b93b94f54e1f4fe48395725a3d92fd2a4af702f6bd70946bdc0c6ac2", size = 16624 } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/51/9b208e85196941db2f0654ad0357ca6388ab3ed67efdbfc799f35d1f83aa/colorlog-6.9.0-py3-none-any.whl", hash = "sha256:5906e71acd67cb07a71e779c47c4bcb45fb8c2993eebe9e5adcd6a6f1b283eff", size = 11424 }, +] + [[package]] name = "crcmod" version = "1.7" @@ -1604,6 +1616,7 @@ name = "songyi" version = "0.1.0" source = { virtual = "." } dependencies = [ + { name = "colorlog" }, { name = "fastapi" }, { name = "funasr" }, { name = "gradio" }, @@ -1623,6 +1636,7 @@ dependencies = [ [package.metadata] requires-dist = [ + { name = "colorlog", specifier = ">=6.9.0" }, { name = "fastapi", specifier = ">=0.111.1" }, { name = "funasr", specifier = ">=1.1.3" }, { name = "gradio" },