Feat summarize audio topic (#594)

* refactor ai commands

* fix json command

* fix extract story command

* allow summarizing topic for audio
an-lee authored on 2024-05-09 10:54:11 +08:00, committed by GitHub
parent 5436b2038c, commit 69a6f721ca
18 changed files with 245 additions and 232 deletions
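In short, the refactor routes every AI command through two shared helpers: textCommand for plain-text completions and jsonCommand for schema-validated JSON output, with each command formatting its prompt up front. A minimal sketch of the resulting call pattern, assuming the helper names and signatures shown in the diffs below (the prompt text and engine options here are illustrative, not from the commit):

import { z } from "zod";
import { ChatPromptTemplate } from "@langchain/core/prompts";
// Both helpers are re-exported from the commands index in this commit.
import { jsonCommand, textCommand } from "@commands";

const demo = async () => {
  // Engine options normally come from useAiCommand's currentEngine; these values are placeholders.
  const options = { key: "sk-...", modelName: "gpt-4-turbo", baseUrl: undefined as string | undefined };

  // Text commands: render the full prompt string first, then delegate to textCommand.
  const prompt = await ChatPromptTemplate.fromMessages([
    ["system", "Summarize the topic in four to five words."],
    ["human", "{text}"],
  ]).format({ text: "..." });
  const title = await textCommand(prompt, options);

  // JSON commands: pass a zod schema; jsonCommand enforces JSON-mode structured output.
  const schema = z.object({ words: z.array(z.string()) });
  const extraction = await jsonCommand(prompt, { ...options, schema });

  return { title, extraction };
};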

View File

@@ -1,5 +1,5 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { textCommand } from "./text.command";
 
 export const analyzeCommand = async (
   text: string,
@@ -10,29 +10,14 @@ export const analyzeCommand = async (
     baseUrl?: string;
   }
 ): Promise<string> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
   if (!text) throw new Error("Text is required");
 
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    configuration: {
-      baseURL: baseUrl,
-    },
-    cache: false,
-    verbose: true,
-    maxRetries: 2,
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", SYSTEM_PROMPT],
     ["human", text],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({});
-  return response.text;
+  ]).format({});
+
+  return textCommand(prompt, options);
 };
 
 const SYSTEM_PROMPT = `你是我的英语教练,我将提供英语文本,你将帮助我分析文本的句子结构、语法和词汇/短语,并对文本进行详细解释。请用中文回答,并按以下格式返回结果:

View File

@@ -1,11 +1,9 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
-import { zodToJsonSchema } from "zod-to-json-schema";
 import { z } from "zod";
-import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants";
+import { jsonCommand } from "./json.command";
 
 export const extractStoryCommand = async (
-  content: string,
+  text: string,
   options: {
     key: string;
     modelName?: string;
@@ -13,61 +11,27 @@ export const extractStoryCommand = async (
     baseUrl?: string;
   }
 ): Promise<{ words: string[]; idioms: string[] }> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
-  if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) {
-    modelName = "gpt-4-turbo";
-  }
-
-  const saveExtraction = z.object({
+  const schema = z.object({
     words: z.array(z.string().describe("extracted word")),
     idioms: z.array(z.string().describe("extracted idiom")),
   });
 
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    modelKwargs: {
-      response_format: {
-        type: "json_object",
-      },
-    },
-    configuration: {
-      baseURL: baseUrl,
-    },
-    cache: true,
-    verbose: true,
-    maxRetries: 2,
-  }).bind({
-    tools: [
-      {
-        type: "function",
-        function: {
-          name: "save_extraction",
-          description: "Save the extracted words and idioms from a text",
-          parameters: zodToJsonSchema(saveExtraction),
-        },
-      },
-    ],
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", EXTRACT_STORY_PROMPT],
     ["human", "{text}"],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({
+  ]).format({
     learning_language: "English",
-    text: content,
+    text,
   });
 
-  return JSON.parse(
-    response.additional_kwargs?.tool_calls?.[0]?.function?.arguments || "{}"
-  );
+  return jsonCommand(prompt, { ...options, schema });
 };
 
 const EXTRACT_STORY_PROMPT = `
-I am an {learning_language} beginner and only have a grasp of 500 high-frequency basic words. You are an {learning_language} learning assistant robot, and your task is to analyze the article I provide and extract all the meaningful words and idioms that I may not be familiar with. Specifically, it should include common words used in uncommon ways. Return in JSON format.
+I am an {learning_language} beginner and only have a grasp of 500 high-frequency basic words. You are an {learning_language} learning assistant robot, and your task is to analyze the article I provide and extract all the meaningful words and idioms that I may not be familiar with. Specifically, it should include common words used in uncommon ways. Return in JSON format like the following:
+{{
+  words: ["word1", "word2", ...],
+  idioms: ["idiom1", "idiom2", ...]
+}}
 `;

View File

@@ -2,5 +2,8 @@ export * from "./extract-story.command";
 export * from "./lookup.command";
 export * from "./translate.command";
 export * from "./ipa.command";
+export * from "./json.command";
 export * from "./analyze.command";
 export * from "./punctuate.command";
+export * from "./summarize-topic.command";
+export * from "./text.command";

View File

@@ -1,11 +1,6 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
 import { z } from "zod";
-import {
-  StructuredOutputParser,
-  OutputFixingParser,
-} from "langchain/output_parsers";
-import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants";
+import { jsonCommand } from "./json.command";
 
 export const ipaCommand = async (
   text: string,
@@ -16,14 +11,9 @@ export const ipaCommand = async (
     baseUrl?: string;
   }
 ): Promise<{ words?: { word?: string; ipa?: string }[] }> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
   if (!text) throw new Error("Text is required");
 
-  if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) {
-    modelName = "gpt-4-turbo";
-  }
-
-  const responseSchema = z.object({
+  const schema = z.object({
     words: z.array(
       z.object({
         word: z.string().nonempty(),
@@ -32,51 +22,15 @@ export const ipaCommand = async (
     ),
   });
 
-  const parser = StructuredOutputParser.fromZodSchema(responseSchema);
-  const fixParser = OutputFixingParser.fromLLM(
-    new ChatOpenAI({
-      openAIApiKey: key,
-      modelName,
-      temperature: 0,
-      configuration: {
-        baseURL: baseUrl,
-      },
-    }),
-    parser
-  );
-
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    configuration: {
-      baseURL: baseUrl,
-    },
-    modelKwargs: {
-      response_format: {
-        type: "json_object",
-      },
-    },
-    cache: true,
-    verbose: true,
-    maxRetries: 2,
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", SYSTEM_PROMPT],
     ["human", "{text}"],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({
+  ]).format({
     learning_language: "English",
     text,
   });
 
-  try {
-    return await parser.parse(response.text);
-  } catch (e) {
-    return await fixParser.parse(response.text);
-  }
+  return jsonCommand(prompt, { ...options, schema });
 };
 
 const SYSTEM_PROMPT = `Generate an array of JSON objects for each {learning_language} word in the given text, with each object containing two keys: 'word' and 'ipa', where 'ipa' is the International Phonetic Alphabet (IPA) representation of the word. Return the array in JSON format only. The output should be structured like this:

View File

@@ -0,0 +1,48 @@
+import { ChatOpenAI } from "@langchain/openai";
+import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants";
+import { zodToJsonSchema } from "zod-to-json-schema";
+
+export const jsonCommand = async (
+  prompt: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+    schema: any;
+  }
+): Promise<any> => {
+  const { key, temperature = 0, baseUrl, schema } = options;
+  let { modelName = "gpt-4-turbo" } = options;
+
+  if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) {
+    modelName = "gpt-4-turbo";
+  }
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    modelKwargs: {
+      response_format: {
+        type: "json_object",
+      },
+    },
+    configuration: {
+      baseURL: baseUrl,
+    },
+    cache: true,
+    verbose: true,
+    maxRetries: 1,
+  });
+
+  const structuredOutput = chatModel.withStructuredOutput(
+    zodToJsonSchema(schema),
+    {
+      method: "jsonMode",
+    }
+  );
+
+  const response = await structuredOutput.invoke(prompt);
+  return response;
+};

View File

@@ -1,11 +1,6 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
 import { z } from "zod";
-import {
-  StructuredOutputParser,
-  OutputFixingParser,
-} from "langchain/output_parsers";
-import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants";
+import { jsonCommand } from "./json.command";
 
 export const lookupCommand = async (
   params: {
@@ -29,16 +24,9 @@ export const lookupCommand = async (
     translation?: string;
     lemma?: string;
   }> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
-  if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) {
-    modelName = "gpt-4-turbo";
-  }
-
   const { word, context, meaningOptions } = params;
 
-  const responseSchema = z.object({
+  const schema = z.object({
     id: z.string().optional(),
     word: z.string().optional(),
     context_translation: z.string().optional(),
@@ -49,37 +37,10 @@ export const lookupCommand = async (
     lemma: z.string().optional(),
   });
 
-  const parser = StructuredOutputParser.fromZodSchema(responseSchema);
-  const fixParser = OutputFixingParser.fromLLM(
-    new ChatOpenAI({
-      openAIApiKey: key,
-      modelName,
-      temperature: 0,
-      configuration: {
-        baseURL: baseUrl,
-      },
-    }),
-    parser
-  );
-
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    configuration: {
-      baseURL: baseUrl,
-    },
-    cache: true,
-    verbose: true,
-    maxRetries: 2,
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", DICITIONARY_PROMPT],
     ["human", "{input}"],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({
+  ]).format({
     learning_language: "English",
     native_language: "Chinese",
     input: JSON.stringify({
@@ -89,11 +50,7 @@ export const lookupCommand = async (
     }),
   });
 
-  try {
-    return await parser.parse(response.text);
-  } catch (e) {
-    return await fixParser.parse(response.text);
-  }
+  return jsonCommand(prompt, { ...options, schema });
 };
 
 const DICITIONARY_PROMPT = `You are an {learning_language}-{native_language} dictionary. I will provide "word(it also maybe a phrase)" and "context" as input, you should return the "word", "lemma", "pronunciation", "pos(part of speech, maybe empty for phrase)", "definition", "translation" and "context_translation" as output. If I provide "definitions", you should try to select the appropriate one for the given context, and return the id of selected definition as "id". If none are suitable, generate a new definition for me. If no context is provided, return the most common definition. If you do not know the appropriate definition, return an empty string for "definition" and "translation".

View File

@@ -1,5 +1,5 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { textCommand } from "./text.command";
 
 export const punctuateCommand = async (
   text: string,
@@ -10,29 +10,14 @@ export const punctuateCommand = async (
     baseUrl?: string;
   }
 ): Promise<string> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
   if (!text) throw new Error("Text is required");
 
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    configuration: {
-      baseURL: baseUrl,
-    },
-    cache: false,
-    verbose: true,
-    maxRetries: 2,
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", SYSTEM_PROMPT],
     ["human", text],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({});
-  return response.text;
+  ]).format({});
+
+  return textCommand(prompt, options);
 };
 
 const SYSTEM_PROMPT = `Please add proper punctuation to the text I provide you. Return the corrected text only.`;

View File

@@ -0,0 +1,24 @@
+import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { textCommand } from "./text.command";
+
+export const summarizeTopicCommand = async (
+  text: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+  }
+): Promise<string> => {
+  if (!text) throw new Error("Text is required");
+
+  const prompt = await ChatPromptTemplate.fromMessages([
+    ["system", SYSTEM_PROMPT],
+    ["human", text],
+  ]).format({});
+
+  return textCommand(prompt, options);
+};
+
+const SYSTEM_PROMPT =
+  "Please generate a four to five word title summarizing our conversation without any lead-in, punctuation, quotation marks, periods, symbols, bold text, or additional text. Remove enclosing quotation marks.";

View File

@@ -0,0 +1,31 @@
+import { ChatOpenAI } from "@langchain/openai";
+
+export const textCommand = async (
+  prompt: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+    systemPrompt?: string;
+  }
+): Promise<string> => {
+  const { key, temperature = 0, baseUrl } = options;
+  let { modelName = "gpt-4-turbo" } = options;
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    configuration: {
+      baseURL: baseUrl,
+    },
+    cache: false,
+    verbose: true,
+    maxRetries: 1,
+  });
+
+  const response = await chatModel.invoke(prompt);
+  return response.text;
+};

View File

@@ -1,5 +1,5 @@
-import { ChatOpenAI } from "@langchain/openai";
 import { ChatPromptTemplate } from "@langchain/core/prompts";
+import { textCommand } from "./text.command";
 
 export const translateCommand = async (
   text: string,
@@ -10,32 +10,17 @@ export const translateCommand = async (
     baseUrl?: string;
   }
 ): Promise<string> => {
-  const { key, temperature = 0, baseUrl } = options;
-  let { modelName = "gpt-4-turbo" } = options;
-
   if (!text) throw new Error("Text is required");
 
-  const chatModel = new ChatOpenAI({
-    openAIApiKey: key,
-    modelName,
-    temperature,
-    configuration: {
-      baseURL: baseUrl,
-    },
-    cache: false,
-    verbose: true,
-    maxRetries: 2,
-  });
-
-  const prompt = ChatPromptTemplate.fromMessages([
+  const prompt = await ChatPromptTemplate.fromMessages([
     ["system", SYSTEM_PROMPT],
     ["human", TRANSLATION_PROMPT],
-  ]);
-
-  const response = await prompt.pipe(chatModel).invoke({
+  ]).format({
     native_language: "Chinese",
     text,
   });
 
-  return response.text;
+  return textCommand(prompt, options);
 };
 
 const SYSTEM_PROMPT =

View File

@@ -570,5 +570,6 @@
   "noNotesYet": "No notes yet",
   "editTranscription": "Edit transcription",
   "saveTranscription": "Save transcription",
-  "areYouSureToSaveTranscription": "It will perform a force-alignment between the audio and your edited transcription. Are you sure to continue?"
+  "areYouSureToSaveTranscription": "It will perform a force-alignment between the audio and your edited transcription. Are you sure to continue?",
+  "summarize": "Summarize"
 }

View File

@@ -569,5 +569,6 @@
   "noNotesYet": "还没有笔记",
   "editTranscription": "编辑语音文本",
   "saveTranscription": "保存语音文本",
-  "areYouSureToSaveTranscription": "即将根据您修改后的语音文本对语音重新进行对齐,确定要继续吗?"
+  "areYouSureToSaveTranscription": "即将根据您修改后的语音文本对语音重新进行对齐,确定要继续吗?",
+  "summarize": "提炼主题"
 }

View File

@@ -1,16 +1,66 @@
-import { useContext } from "react";
-import { MediaPlayerProviderContext } from "@renderer/context";
+import { useContext, useState } from "react";
+import {
+  AppSettingsProviderContext,
+  MediaPlayerProviderContext,
+} from "@renderer/context";
 import { formatDuration, formatDateTime } from "@renderer/lib/utils";
 import { t } from "i18next";
+import { Button, toast } from "@renderer/components/ui";
+import { useAiCommand } from "@renderer/hooks";
+import { LoaderIcon } from "lucide-react";
 
 export const MediaInfoPanel = () => {
-  const { media } = useContext(MediaPlayerProviderContext);
+  const { media, transcription } = useContext(MediaPlayerProviderContext);
+  const { EnjoyApp } = useContext(AppSettingsProviderContext);
+  const { summarizeTopic } = useAiCommand();
+  const [summarizing, setSummarizing] = useState<boolean>(false);
+
+  const handleSummarize = async () => {
+    setSummarizing(true);
+
+    try {
+      const topic = await summarizeTopic(transcription.result.transcript);
+
+      if (media.mediaType === "Video") {
+        await EnjoyApp.videos.update(media.id, {
+          name: topic,
+        });
+      } else if (media.mediaType === "Audio") {
+        await EnjoyApp.audios.update(media.id, {
+          name: topic,
+        });
+      }
+    } catch (error) {
+      toast.error(error);
+    }
+
+    setSummarizing(false);
+  };
 
   if (!media) return null;
 
   return (
     <div className="px-4" data-testid="media-info-panel">
+      <div className="mb-2">
+        <div className="flex items-center justify-between">
+          <div className="capitalize text-sm text-muted-foreground mb-1">
+            {t("models.audio.name")}
+          </div>
+          <Button
+            disabled={summarizing}
+            onClick={handleSummarize}
+            variant="outline"
+            size="sm"
+          >
+            {summarizing && (
+              <LoaderIcon className="animate-spin mr-2" size={16} />
+            )}
+            {t("summarize")}
+          </Button>
+        </div>
+        <div className="">{media.name}</div>
+      </div>
       {[
-        { label: t("models.audio.name"), value: media.name },
         {
           label: t("models.audio.duration"),
           value: formatDuration(media.duration),
View File

@@ -34,7 +34,7 @@ import { useCopyToClipboard } from "@uidotdev/usehooks";
 import { t } from "i18next";
 import { AppSettingsProviderContext } from "@renderer/context";
 import Markdown from "react-markdown";
-import { useConversation } from "@renderer/hooks";
+import { useConversation, useAiCommand } from "@renderer/hooks";
 
 export const AssistantMessageComponent = (props: {
   message: MessageType;
@@ -52,6 +52,7 @@ export const AssistantMessageComponent = (props: {
   const [shadowing, setShadowing] = useState<boolean>(false);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
   const { tts } = useConversation();
+  const { summarizeTopic } = useAiCommand();
 
   useEffect(() => {
     if (speech) return;
@@ -100,11 +101,19 @@ export const AssistantMessageComponent = (props: {
     if (!audio) {
       setResourcing(true);
+
+      let title =
+        speech.text.length > 20
+          ? speech.text.substring(0, 17).trim() + "..."
+          : speech.text;
+
+      try {
+        title = await summarizeTopic(speech.text);
+      } catch (e) {
+        console.warn(e);
+      }
+
       await EnjoyApp.audios.create(speech.filePath, {
-        name:
-          speech.text.length > 20
-            ? speech.text.substring(0, 17).trim() + "..."
-            : speech.text,
+        name: title,
         originalText: speech.text,
       });
       setResourcing(false);
@@ -169,7 +178,7 @@ export const AssistantMessageComponent = (props: {
           new URL(props.href ?? "");
           props.target = "_blank";
           props.rel = "noopener noreferrer";
-        } catch (e) { }
+        } catch (e) {}
 
         return <a {...props}>{children}</a>;
       },

View File

@@ -510,7 +510,7 @@ export const MediaPlayerProvider = ({
     EnjoyApp.waveforms.find(media.md5).then((waveform) => {
       setWaveForm(waveform);
     });
-  }, [media]);
+  }, [media?.md5]);
 
   /*
    * Initialize wavesurfer when container ref is available
@@ -524,7 +524,7 @@ export const MediaPlayerProvider = ({
       setDecoded(false);
       setDecodeError(null);
     };
-  }, [media, ref, mediaProvider, layout?.playerHeight]);
+  }, [media?.src, ref, mediaProvider, layout?.playerHeight]);
 
   useEffect(() => {
     calculateHeight();

View File

@@ -9,6 +9,7 @@ import {
   translateCommand,
   analyzeCommand,
   punctuateCommand,
+  summarizeTopicCommand,
 } from "@commands";
 
 export const useAiCommand = () => {
@@ -108,14 +109,23 @@ export const useAiCommand = () => {
       key: currentEngine.key,
       modelName: currentEngine.model,
       baseUrl: currentEngine.baseUrl,
-    })
+    });
+  };
+
+  const summarizeTopic = async (text: string) => {
+    return summarizeTopicCommand(text, {
+      key: currentEngine.key,
+      modelName: currentEngine.model,
+      baseUrl: currentEngine.baseUrl,
+    });
   };
 
   return {
     lookupWord,
     extractStory,
     translate,
     analyzeText,
-    punctuateText
+    punctuateText,
+    summarizeTopic,
   };
 };

View File

@@ -5,17 +5,20 @@ import {
 } from "@renderer/context";
 import { toast } from "@renderer/components/ui";
 import { t } from "i18next";
+import { useThrottle } from "@uidotdev/usehooks";
 
 export const useAudio = (options: { id?: string; md5?: string }) => {
   const { id, md5 } = options;
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
   const [audio, setAudio] = useState<AudioType>(null);
+  const throttledAudio = useThrottle(audio, 500);
 
   const onAudioUpdate = (event: CustomEvent) => {
     const { model, action, record } = event.detail || {};
-    if (model !== "Audio") return;
-    if (record?.id != audio?.id) return;
+    if (model != "Audio") return;
+    if (id && record.id !== id) return;
+    if (md5 && record.md5 !== md5) return;
     if (action !== "update") return;
 
     setAudio(record);
@@ -38,6 +41,6 @@ export const useAudio = (options: { id?: string; md5?: string }) => {
   }, [id, md5]);
 
   return {
-    audio,
+    audio: throttledAudio,
   };
 };

View File

@@ -5,17 +5,20 @@ import {
 } from "@renderer/context";
 import { toast } from "@renderer/components/ui";
 import { t } from "i18next";
+import { useThrottle } from "@uidotdev/usehooks";
 
 export const useVideo = (options: { id?: string; md5?: string }) => {
   const { id, md5 } = options;
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
   const [video, setVideo] = useState<VideoType>(null);
+  const throttledVideo = useThrottle(video, 500);
 
   const onAudioUpdate = (event: CustomEvent) => {
     const { model, action, record } = event.detail || {};
-    if (model !== "Audio") return;
-    if (record?.id != video?.id) return;
+    if (model !== "Video") return;
+    if (id && record.id !== id) return;
+    if (md5 && record.md5 !== md5) return;
     if (action !== "update") return;
 
     setVideo(record);
@@ -38,6 +41,6 @@ export const useVideo = (options: { id?: string; md5?: string }) => {
   }, [id, md5]);
 
   return {
-    video,
+    video: throttledVideo,
   };
 };