change audio to text use local service
This commit is contained in:
BIN
courses.db
BIN
courses.db
Binary file not shown.
@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor
|
|||||||
from os import makedirs
|
from os import makedirs
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
from gradio_client import Client, handle_file
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
||||||
@@ -14,15 +15,6 @@ import json
|
|||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
config.read('config.ini')
|
config.read('config.ini')
|
||||||
max_download_threads = int(config['DEFAULT']['max_download_threads'])
|
max_download_threads = int(config['DEFAULT']['max_download_threads'])
|
||||||
# start_course_id = int(config['DEFAULT']['start_course_id'])
|
|
||||||
|
|
||||||
# 转译url
|
|
||||||
trans_url = 'https://api.siliconflow.cn/v1/audio/transcriptions'
|
|
||||||
|
|
||||||
headers = {
|
|
||||||
"Authorization": "Bearer sk-lakndqcjlmtukekcliwkkryaxquifduhvzgcnlhofzvofllv",
|
|
||||||
# "Content-Type": "multipart/form-data"
|
|
||||||
}
|
|
||||||
|
|
||||||
# 设置日志配置
|
# 设置日志配置
|
||||||
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||||||
@@ -55,9 +47,9 @@ def create_audio_transcriptions_table(db_path):
|
|||||||
try:
|
try:
|
||||||
# 执行创建表的SQL语句
|
# 执行创建表的SQL语句
|
||||||
cursor.execute(CREATE_TABLE_SQL)
|
cursor.execute(CREATE_TABLE_SQL)
|
||||||
print("表audio_transcriptions创建成功。")
|
logging.info("表audio_transcriptions创建成功。")
|
||||||
except sqlite3.Error as e:
|
except sqlite3.Error as e:
|
||||||
print(f"创建表时出错: {e}")
|
logging.error(f"创建表时出错: {e}")
|
||||||
finally:
|
finally:
|
||||||
# 关闭数据库连接
|
# 关闭数据库连接
|
||||||
conn.close()
|
conn.close()
|
||||||
@@ -67,8 +59,6 @@ def create_audio_transcriptions_table(db_path):
|
|||||||
db_path = 'courses.db' # 数据库文件路径
|
db_path = 'courses.db' # 数据库文件路径
|
||||||
|
|
||||||
|
|
||||||
# create_audio_transcriptions_table(db_path)
|
|
||||||
|
|
||||||
# 下载音频文件
|
# 下载音频文件
|
||||||
def download_file(url, local_path):
|
def download_file(url, local_path):
|
||||||
try:
|
try:
|
||||||
@@ -87,23 +77,15 @@ def download_file(url, local_path):
|
|||||||
|
|
||||||
# 调用api将语音转换为文本
|
# 调用api将语音转换为文本
|
||||||
def voice2txt(voice_path):
|
def voice2txt(voice_path):
|
||||||
url = trans_url
|
|
||||||
multipart_form_data = {
|
|
||||||
'file': ('audio.mp3', open(voice_path, 'rb')),
|
|
||||||
'model': (None, 'FunAudioLLM/SenseVoiceSmall')
|
|
||||||
}
|
|
||||||
response = requests.request("POST", url, files=multipart_form_data, headers=headers)
|
|
||||||
# 检查请求是否成功
|
|
||||||
if response.status_code == 200:
|
|
||||||
# 解析JSON响应
|
|
||||||
data = response.json()
|
|
||||||
|
|
||||||
# 提取text的值
|
client = Client("http://192.168.31.3:7860/")
|
||||||
text_value = data.get('text', None) # 使用get方法可以避免KeyError,如果'text'键不存在则返回None
|
text = client.predict(
|
||||||
logging.info(f"Text value: {text_value}")
|
input_wav=handle_file(voice_path),
|
||||||
return text_value
|
language="zh",
|
||||||
else:
|
api_name="/model_inference"
|
||||||
print('请求失败,状态码:', response.status_code)
|
)
|
||||||
|
logging.info(text)
|
||||||
|
return text
|
||||||
|
|
||||||
|
|
||||||
# 保存文本到数据库
|
# 保存文本到数据库
|
||||||
@@ -199,9 +181,9 @@ def get_content():
|
|||||||
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID
|
max_course_id = cursor.execute('SELECT id FROM courses ORDER BY id DESC LIMIT 1') # 获取数据库中最大的课程ID
|
||||||
if max_course_id:
|
if max_course_id:
|
||||||
max_course_id = max_course_id.fetchone()[0]
|
max_course_id = max_course_id.fetchone()[0]
|
||||||
print(f"The maximum course ID is {max_course_id}")
|
logging.info(f"The maximum course ID is {max_course_id}")
|
||||||
else:
|
else:
|
||||||
print("No courses found in the database.")
|
logging.info("No courses found in the database.")
|
||||||
max_course_id = 11
|
max_course_id = 11
|
||||||
start_course_id = max_course_id - 5
|
start_course_id = max_course_id - 5
|
||||||
|
|
||||||
@@ -219,7 +201,7 @@ def get_content():
|
|||||||
|
|
||||||
# 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中
|
# 先请求全部的链接获取数据,并将获取到的课程信息保存到数据库中
|
||||||
for course_id in course_ids:
|
for course_id in course_ids:
|
||||||
print(f"Processing course ID: {course_id}")
|
logging.info(f"Processing course ID: {course_id}")
|
||||||
|
|
||||||
json_filename = os.path.join('json', f'{course_id}.json')
|
json_filename = os.path.join('json', f'{course_id}.json')
|
||||||
# copy_json_file_name = os.path.join('data', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?')
|
# copy_json_file_name = os.path.join('data', 'json', f'{course_ids_dict[course_id]}.json').replace('?', '?')
|
||||||
@@ -228,17 +210,16 @@ def get_content():
|
|||||||
# md_file_name = os.path.join('data', 'markdown', f'{course_ids_dict[course_id]}.md')
|
# md_file_name = os.path.join('data', 'markdown', f'{course_ids_dict[course_id]}.md')
|
||||||
md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}.md')
|
md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}.md')
|
||||||
if os.path.exists(json_filename):
|
if os.path.exists(json_filename):
|
||||||
print(f"Course {course_id} JSON file already exists, using local file.")
|
logging.info(f"Course {course_id} JSON file already exists, using local file.")
|
||||||
makedirs(f'course/{course_id}/json', exist_ok=True)
|
makedirs(f'course/{course_id}/json', exist_ok=True)
|
||||||
shutil.copy2(json_filename, copy_json_file_name)
|
shutil.copy2(json_filename, copy_json_file_name)
|
||||||
json_to_markdown(copy_json_file_name, md_file_name)
|
json_to_markdown(copy_json_file_name, md_file_name)
|
||||||
else:
|
else:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# logseq_md_file_name = os.path.join('data', 'markdown_logseq', f'{course_ids_dict[course_id]}.md')
|
|
||||||
logseq_md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}_logseq.md')
|
logseq_md_file_name = os.path.join('course', f'{course_id}', f'{course_ids_dict[course_id]}_logseq.md')
|
||||||
if os.path.exists(json_filename):
|
if os.path.exists(json_filename):
|
||||||
print(f"Course {course_id} JSON file already exists, using local file.")
|
logging.info(f"Course {course_id} JSON file already exists, using local file.")
|
||||||
shutil.copy2(json_filename, copy_json_file_name)
|
shutil.copy2(json_filename, copy_json_file_name)
|
||||||
json_to_markdown(copy_json_file_name, logseq_md_file_name, logseq=True)
|
json_to_markdown(copy_json_file_name, logseq_md_file_name, logseq=True)
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ from gradio_client import Client, handle_file
|
|||||||
from pydub import AudioSegment
|
from pydub import AudioSegment
|
||||||
from pydub.silence import split_on_silence
|
from pydub.silence import split_on_silence
|
||||||
|
|
||||||
use_remote_api = True
|
use_remote_api = False
|
||||||
process_workers = 5 if use_remote_api else 1
|
process_workers = 5 if use_remote_api else 1
|
||||||
|
|
||||||
config = configparser.ConfigParser()
|
config = configparser.ConfigParser()
|
||||||
|
|||||||
Reference in New Issue
Block a user