From 69a6f721ca7081c92619632f0d41839cc5d323c3 Mon Sep 17 00:00:00 2001 From: an-lee Date: Thu, 9 May 2024 10:54:11 +0800 Subject: [PATCH] Feat summarize audio topic (#594) * refactor ai commands * fix json command * fix extract story command * may summarize topic for audio --- enjoy/src/commands/analyze.command.ts | 25 ++------ enjoy/src/commands/extract-story.command.ts | 62 ++++--------------- enjoy/src/commands/index.ts | 3 + enjoy/src/commands/ipa.command.ts | 58 ++--------------- enjoy/src/commands/json.command.ts | 48 ++++++++++++++ enjoy/src/commands/lookup.command.ts | 53 ++-------------- enjoy/src/commands/punctuate.command.ts | 25 ++------ enjoy/src/commands/summarize-topic.command.ts | 24 +++++++ enjoy/src/commands/text.command.ts | 31 ++++++++++ enjoy/src/commands/translate.command.ts | 25 ++------ enjoy/src/i18n/en.json | 3 +- enjoy/src/i18n/zh-CN.json | 3 +- .../components/medias/media-info-panel.tsx | 58 +++++++++++++++-- .../components/messages/assistant-message.tsx | 21 +++++-- .../context/media-player-provider.tsx | 4 +- enjoy/src/renderer/hooks/use-ai-command.tsx | 16 ++++- enjoy/src/renderer/hooks/use-audio.tsx | 9 ++- enjoy/src/renderer/hooks/use-video.tsx | 9 ++- 18 files changed, 245 insertions(+), 232 deletions(-) create mode 100644 enjoy/src/commands/json.command.ts create mode 100644 enjoy/src/commands/summarize-topic.command.ts create mode 100644 enjoy/src/commands/text.command.ts diff --git a/enjoy/src/commands/analyze.command.ts b/enjoy/src/commands/analyze.command.ts index d22bf375..f6ebc883 100644 --- a/enjoy/src/commands/analyze.command.ts +++ b/enjoy/src/commands/analyze.command.ts @@ -1,5 +1,5 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { textCommand } from "./text.command"; export const analyzeCommand = async ( text: string, @@ -10,29 +10,14 @@ export const analyzeCommand = async ( baseUrl?: string; } ): Promise => { - const { key, temperature = 0, 
baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; + if (!text) throw new Error("Text is required"); - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature, - configuration: { - baseURL: baseUrl, - }, - cache: false, - verbose: true, - maxRetries: 2, - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", SYSTEM_PROMPT], ["human", text], - ]); + ]).format({}); - const response = await prompt.pipe(chatModel).invoke({}); - - return response.text; + return textCommand(prompt, options); }; const SYSTEM_PROMPT = `你是我的英语教练,我将提供英语文本,你将帮助我分析文本的句子结构、语法和词汇/短语,并对文本进行详细解释。请用中文回答,并按以下格式返回结果: diff --git a/enjoy/src/commands/extract-story.command.ts b/enjoy/src/commands/extract-story.command.ts index 4d53fffe..1168c27a 100644 --- a/enjoy/src/commands/extract-story.command.ts +++ b/enjoy/src/commands/extract-story.command.ts @@ -1,11 +1,9 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; -import { zodToJsonSchema } from "zod-to-json-schema"; import { z } from "zod"; -import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants"; +import { jsonCommand } from "./json.command"; export const extractStoryCommand = async ( - content: string, + text: string, options: { key: string; modelName?: string; @@ -13,61 +11,27 @@ export const extractStoryCommand = async ( baseUrl?: string; } ): Promise<{ words: string[]; idioms: string[] }> => { - const { key, temperature = 0, baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; - - if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) { - modelName = "gpt-4-turbo"; - } - - const saveExtraction = z.object({ + const schema = z.object({ words: z.array(z.string().describe("extracted word")), idioms: z.array(z.string().describe("extracted idiom")), }); - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature, - modelKwargs: { - 
response_format: { - type: "json_object", - }, - }, - configuration: { - baseURL: baseUrl, - }, - cache: true, - verbose: true, - maxRetries: 2, - }).bind({ - tools: [ - { - type: "function", - function: { - name: "save_extraction", - description: "Save the extracted words and idioms from a text", - parameters: zodToJsonSchema(saveExtraction), - }, - }, - ], - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", EXTRACT_STORY_PROMPT], ["human", "{text}"], - ]); - - const response = await prompt.pipe(chatModel).invoke({ + ]).format({ learning_language: "English", - text: content, + text, }); - return JSON.parse( - response.additional_kwargs?.tool_calls?.[0]?.function?.arguments || "{}" - ); + return jsonCommand(prompt, { ...options, schema }); }; const EXTRACT_STORY_PROMPT = ` -I am an {learning_language} beginner and only have a grasp of 500 high-frequency basic words. You are an {learning_language} learning assistant robot, and your task is to analyze the article I provide and extract all the meaningful words and idioms that I may not be familiar with. Specifically, it should include common words used in uncommon ways. Return in JSON format. +I am an {learning_language} beginner and only have a grasp of 500 high-frequency basic words. You are an {learning_language} learning assistant robot, and your task is to analyze the article I provide and extract all the meaningful words and idioms that I may not be familiar with. Specifically, it should include common words used in uncommon ways. Return in JSON format like the following: + +{{ + words: ["word1", "word2", ...], + idioms: ["idiom1", "idiom2", ...] 
+}} `; diff --git a/enjoy/src/commands/index.ts b/enjoy/src/commands/index.ts index 25f8bd05..cbd587fc 100644 --- a/enjoy/src/commands/index.ts +++ b/enjoy/src/commands/index.ts @@ -2,5 +2,8 @@ export * from "./extract-story.command"; export * from "./lookup.command"; export * from "./translate.command"; export * from "./ipa.command"; +export * from "./json.command"; export * from "./analyze.command"; export * from "./punctuate.command"; +export * from "./summarize-topic.command"; +export * from "./text.command"; diff --git a/enjoy/src/commands/ipa.command.ts b/enjoy/src/commands/ipa.command.ts index d4c50f19..1f04bdd2 100644 --- a/enjoy/src/commands/ipa.command.ts +++ b/enjoy/src/commands/ipa.command.ts @@ -1,11 +1,6 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; import { z } from "zod"; -import { - StructuredOutputParser, - OutputFixingParser, -} from "langchain/output_parsers"; -import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants"; +import { jsonCommand } from "./json.command"; export const ipaCommand = async ( text: string, @@ -16,14 +11,9 @@ export const ipaCommand = async ( baseUrl?: string; } ): Promise<{ words?: { word?: string; ipa?: string }[] }> => { - const { key, temperature = 0, baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; + if (!text) throw new Error("Text is required"); - if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) { - modelName = "gpt-4-turbo"; - } - - const responseSchema = z.object({ + const schema = z.object({ words: z.array( z.object({ word: z.string().nonempty(), @@ -32,51 +22,15 @@ export const ipaCommand = async ( ), }); - const parser = StructuredOutputParser.fromZodSchema(responseSchema); - const fixParser = OutputFixingParser.fromLLM( - new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature: 0, - configuration: { - baseURL: baseUrl, - }, - }), - parser - ); - - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - 
modelName, - temperature, - configuration: { - baseURL: baseUrl, - }, - modelKwargs: { - response_format: { - type: "json_object", - }, - }, - cache: true, - verbose: true, - maxRetries: 2, - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", SYSTEM_PROMPT], ["human", "{text}"], - ]); - - const response = await prompt.pipe(chatModel).invoke({ + ]).format({ learning_language: "English", text, }); - try { - return await parser.parse(response.text); - } catch (e) { - return await fixParser.parse(response.text); - } + return jsonCommand(prompt, { ...options, schema }); }; const SYSTEM_PROMPT = `Generate an array of JSON objects for each {learning_language} word in the given text, with each object containing two keys: 'word' and 'ipa', where 'ipa' is the International Phonetic Alphabet (IPA) representation of the word. Return the array in JSON format only. The output should be structured like this: diff --git a/enjoy/src/commands/json.command.ts b/enjoy/src/commands/json.command.ts new file mode 100644 index 00000000..7cbd9a65 --- /dev/null +++ b/enjoy/src/commands/json.command.ts @@ -0,0 +1,48 @@ +import { ChatOpenAI } from "@langchain/openai"; +import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants"; +import { zodToJsonSchema } from "zod-to-json-schema"; + +export const jsonCommand = async ( + prompt: string, + options: { + key: string; + modelName?: string; + temperature?: number; + baseUrl?: string; + schema: any; + } +): Promise => { + const { key, temperature = 0, baseUrl, schema } = options; + let { modelName = "gpt-4-turbo" } = options; + + if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) { + modelName = "gpt-4-turbo"; + } + + const chatModel = new ChatOpenAI({ + openAIApiKey: key, + modelName, + temperature, + modelKwargs: { + response_format: { + type: "json_object", + }, + }, + configuration: { + baseURL: baseUrl, + }, + cache: true, + verbose: true, + maxRetries: 1, + }); + 
+ const structuredOutput = chatModel.withStructuredOutput( + zodToJsonSchema(schema), + { + method: "jsonMode", + } + ); + + const response = await structuredOutput.invoke(prompt); + return response; +}; diff --git a/enjoy/src/commands/lookup.command.ts b/enjoy/src/commands/lookup.command.ts index 68e0eb99..3e72acc9 100644 --- a/enjoy/src/commands/lookup.command.ts +++ b/enjoy/src/commands/lookup.command.ts @@ -1,11 +1,6 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; import { z } from "zod"; -import { - StructuredOutputParser, - OutputFixingParser, -} from "langchain/output_parsers"; -import { RESPONSE_JSON_FORMAT_MODELS } from "@/constants"; +import { jsonCommand } from "./json.command"; export const lookupCommand = async ( params: { @@ -29,16 +24,9 @@ export const lookupCommand = async ( translation?: string; lemma?: string; }> => { - const { key, temperature = 0, baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; - - if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) { - modelName = "gpt-4-turbo"; - } - const { word, context, meaningOptions } = params; - const responseSchema = z.object({ + const schema = z.object({ id: z.string().optional(), word: z.string().optional(), context_translation: z.string().optional(), @@ -49,37 +37,10 @@ export const lookupCommand = async ( lemma: z.string().optional(), }); - const parser = StructuredOutputParser.fromZodSchema(responseSchema); - const fixParser = OutputFixingParser.fromLLM( - new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature: 0, - configuration: { - baseURL: baseUrl, - }, - }), - parser - ); - - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature, - configuration: { - baseURL: baseUrl, - }, - cache: true, - verbose: true, - maxRetries: 2, - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", 
DICITIONARY_PROMPT], ["human", "{input}"], - ]); - - const response = await prompt.pipe(chatModel).invoke({ + ]).format({ learning_language: "English", native_language: "Chinese", input: JSON.stringify({ @@ -89,11 +50,7 @@ export const lookupCommand = async ( }), }); - try { - return await parser.parse(response.text); - } catch (e) { - return await fixParser.parse(response.text); - } + return jsonCommand(prompt, { ...options, schema }); }; const DICITIONARY_PROMPT = `You are an {learning_language}-{native_language} dictionary. I will provide "word(it also maybe a phrase)" and "context" as input, you should return the "word", "lemma", "pronunciation", "pos(part of speech, maybe empty for phrase)", "definition", "translation" and "context_translation" as output. If I provide "definitions", you should try to select the appropriate one for the given context, and return the id of selected definition as "id". If none are suitable, generate a new definition for me. If no context is provided, return the most common definition. If you do not know the appropriate definition, return an empty string for "definition" and "translation". 
diff --git a/enjoy/src/commands/punctuate.command.ts b/enjoy/src/commands/punctuate.command.ts index d885c0fc..2ba31657 100644 --- a/enjoy/src/commands/punctuate.command.ts +++ b/enjoy/src/commands/punctuate.command.ts @@ -1,5 +1,5 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { textCommand } from "./text.command"; export const punctuateCommand = async ( text: string, @@ -10,29 +10,14 @@ export const punctuateCommand = async ( baseUrl?: string; } ): Promise => { - const { key, temperature = 0, baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; + if (!text) throw new Error("Text is required"); - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature, - configuration: { - baseURL: baseUrl, - }, - cache: false, - verbose: true, - maxRetries: 2, - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", SYSTEM_PROMPT], ["human", text], - ]); + ]).format({}); - const response = await prompt.pipe(chatModel).invoke({}); - - return response.text; + return textCommand(prompt, options); }; const SYSTEM_PROMPT = `Please add proper punctuation to the text I provide you. 
Return the corrected text only.`; diff --git a/enjoy/src/commands/summarize-topic.command.ts b/enjoy/src/commands/summarize-topic.command.ts new file mode 100644 index 00000000..74031ecc --- /dev/null +++ b/enjoy/src/commands/summarize-topic.command.ts @@ -0,0 +1,24 @@ +import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { textCommand } from "./text.command"; + +export const summarizeTopicCommand = async ( + text: string, + options: { + key: string; + modelName?: string; + temperature?: number; + baseUrl?: string; + } +): Promise => { + if (!text) throw new Error("Text is required"); + + const prompt = await ChatPromptTemplate.fromMessages([ + ["system", SYSTEM_PROMPT], + ["human", text], + ]).format({}); + + return textCommand(prompt, options); +}; + +const SYSTEM_PROMPT = + "Please generate a four to five word title summarizing our conversation without any lead-in, punctuation, quotation marks, periods, symbols, bold text, or additional text. Remove enclosing quotation marks."; diff --git a/enjoy/src/commands/text.command.ts b/enjoy/src/commands/text.command.ts new file mode 100644 index 00000000..64d23445 --- /dev/null +++ b/enjoy/src/commands/text.command.ts @@ -0,0 +1,31 @@ +import { ChatOpenAI } from "@langchain/openai"; + +export const textCommand = async ( + prompt: string, + options: { + key: string; + modelName?: string; + temperature?: number; + baseUrl?: string; + systemPrompt?: string; + } +): Promise => { + const { key, temperature = 0, baseUrl } = options; + let { modelName = "gpt-4-turbo" } = options; + + const chatModel = new ChatOpenAI({ + openAIApiKey: key, + modelName, + temperature, + configuration: { + baseURL: baseUrl, + }, + cache: false, + verbose: true, + maxRetries: 1, + }); + + const response = await chatModel.invoke(prompt); + + return response.text; +}; diff --git a/enjoy/src/commands/translate.command.ts b/enjoy/src/commands/translate.command.ts index f7839394..b6460590 100644 --- 
a/enjoy/src/commands/translate.command.ts +++ b/enjoy/src/commands/translate.command.ts @@ -1,5 +1,5 @@ -import { ChatOpenAI } from "@langchain/openai"; import { ChatPromptTemplate } from "@langchain/core/prompts"; +import { textCommand } from "./text.command"; export const translateCommand = async ( text: string, @@ -10,32 +10,17 @@ export const translateCommand = async ( baseUrl?: string; } ): Promise => { - const { key, temperature = 0, baseUrl } = options; - let { modelName = "gpt-4-turbo" } = options; + if (!text) throw new Error("Text is required"); - const chatModel = new ChatOpenAI({ - openAIApiKey: key, - modelName, - temperature, - configuration: { - baseURL: baseUrl, - }, - cache: false, - verbose: true, - maxRetries: 2, - }); - - const prompt = ChatPromptTemplate.fromMessages([ + const prompt = await ChatPromptTemplate.fromMessages([ ["system", SYSTEM_PROMPT], ["human", TRANSLATION_PROMPT], - ]); - - const response = await prompt.pipe(chatModel).invoke({ + ]).format({ native_language: "Chinese", text, }); - return response.text; + return textCommand(prompt, options); }; const SYSTEM_PROMPT = diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json index b67f0c1b..d87a284c 100644 --- a/enjoy/src/i18n/en.json +++ b/enjoy/src/i18n/en.json @@ -570,5 +570,6 @@ "noNotesYet": "No notes yet", "editTranscription": "Edit transcription", "saveTranscription": "Save transcription", - "areYouSureToSaveTranscription": "It will perform a force-alignment between the audio and your edited transcription. Are you sure to continue?" + "areYouSureToSaveTranscription": "It will perform a force-alignment between the audio and your edited transcription. 
Are you sure to continue?", + "summarize": "Summarize" } diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json index d554cfd4..487e4e1a 100644 --- a/enjoy/src/i18n/zh-CN.json +++ b/enjoy/src/i18n/zh-CN.json @@ -569,5 +569,6 @@ "noNotesYet": "还没有笔记", "editTranscription": "编辑语音文本", "saveTranscription": "保存语音文本", - "areYouSureToSaveTranscription": "即将根据您修改后的语音文本对语音重新进行对齐,确定要继续吗?" + "areYouSureToSaveTranscription": "即将根据您修改后的语音文本对语音重新进行对齐,确定要继续吗?", + "summarize": "提炼主题" } diff --git a/enjoy/src/renderer/components/medias/media-info-panel.tsx b/enjoy/src/renderer/components/medias/media-info-panel.tsx index 4f865173..b00e1148 100644 --- a/enjoy/src/renderer/components/medias/media-info-panel.tsx +++ b/enjoy/src/renderer/components/medias/media-info-panel.tsx @@ -1,16 +1,66 @@ -import { useContext } from "react"; -import { MediaPlayerProviderContext } from "@renderer/context"; +import { useContext, useState } from "react"; +import { + AppSettingsProviderContext, + MediaPlayerProviderContext, +} from "@renderer/context"; import { formatDuration, formatDateTime } from "@renderer/lib/utils"; import { t } from "i18next"; +import { Button, toast } from "@renderer/components/ui"; +import { useAiCommand } from "@renderer/hooks"; +import { LoaderIcon } from "lucide-react"; export const MediaInfoPanel = () => { - const { media } = useContext(MediaPlayerProviderContext); + const { media, transcription } = useContext(MediaPlayerProviderContext); + const { EnjoyApp } = useContext(AppSettingsProviderContext); + const { summarizeTopic } = useAiCommand(); + const [summarizing, setSummarizing] = useState(false); + + const handleSummarize = async () => { + setSummarizing(true); + + try { + const topic = await summarizeTopic(transcription.result.transcript); + if (media.mediaType === "Video") { + await EnjoyApp.videos.update(media.id, { + name: topic, + }); + } else if (media.mediaType === "Audio") { + await EnjoyApp.audios.update(media.id, { + name: topic, + }); + } + } 
catch (error) { + toast.error(error); + } + + setSummarizing(false); + }; + if (!media) return null; return (
+
+
+
+ {t("models.audio.name")} +
+ +
+
{media.name}
+
+ {[ - { label: t("models.audio.name"), value: media.name }, { label: t("models.audio.duration"), value: formatDuration(media.duration), diff --git a/enjoy/src/renderer/components/messages/assistant-message.tsx b/enjoy/src/renderer/components/messages/assistant-message.tsx index 1b4c9488..06f9b23b 100644 --- a/enjoy/src/renderer/components/messages/assistant-message.tsx +++ b/enjoy/src/renderer/components/messages/assistant-message.tsx @@ -34,7 +34,7 @@ import { useCopyToClipboard } from "@uidotdev/usehooks"; import { t } from "i18next"; import { AppSettingsProviderContext } from "@renderer/context"; import Markdown from "react-markdown"; -import { useConversation } from "@renderer/hooks"; +import { useConversation, useAiCommand } from "@renderer/hooks"; export const AssistantMessageComponent = (props: { message: MessageType; @@ -52,6 +52,7 @@ export const AssistantMessageComponent = (props: { const [shadowing, setShadowing] = useState(false); const { EnjoyApp } = useContext(AppSettingsProviderContext); const { tts } = useConversation(); + const { summarizeTopic } = useAiCommand(); useEffect(() => { if (speech) return; @@ -100,11 +101,19 @@ export const AssistantMessageComponent = (props: { if (!audio) { setResourcing(true); + let title = + speech.text.length > 20 + ? speech.text.substring(0, 17).trim() + "..." + : speech.text; + + try { + title = await summarizeTopic(speech.text); + } catch (e) { + console.warn(e); + } + await EnjoyApp.audios.create(speech.filePath, { - name: - speech.text.length > 20 - ? speech.text.substring(0, 17).trim() + "..." - : speech.text, + name: title, originalText: speech.text, }); setResourcing(false); @@ -169,7 +178,7 @@ export const AssistantMessageComponent = (props: { new URL(props.href ?? 
""); props.target = "_blank"; props.rel = "noopener noreferrer"; - } catch (e) { } + } catch (e) {} return {children}; }, diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx index ee268947..adc3e9d0 100644 --- a/enjoy/src/renderer/context/media-player-provider.tsx +++ b/enjoy/src/renderer/context/media-player-provider.tsx @@ -510,7 +510,7 @@ export const MediaPlayerProvider = ({ EnjoyApp.waveforms.find(media.md5).then((waveform) => { setWaveForm(waveform); }); - }, [media]); + }, [media?.md5]); /* * Initialize wavesurfer when container ref is available @@ -524,7 +524,7 @@ export const MediaPlayerProvider = ({ setDecoded(false); setDecodeError(null); }; - }, [media, ref, mediaProvider, layout?.playerHeight]); + }, [media?.src, ref, mediaProvider, layout?.playerHeight]); useEffect(() => { calculateHeight(); diff --git a/enjoy/src/renderer/hooks/use-ai-command.tsx b/enjoy/src/renderer/hooks/use-ai-command.tsx index c337655e..7c36a096 100644 --- a/enjoy/src/renderer/hooks/use-ai-command.tsx +++ b/enjoy/src/renderer/hooks/use-ai-command.tsx @@ -9,6 +9,7 @@ import { translateCommand, analyzeCommand, punctuateCommand, + summarizeTopicCommand, } from "@commands"; export const useAiCommand = () => { @@ -108,14 +109,23 @@ export const useAiCommand = () => { key: currentEngine.key, modelName: currentEngine.model, baseUrl: currentEngine.baseUrl, - }) - } + }); + }; + + const summarizeTopic = async (text: string) => { + return summarizeTopicCommand(text, { + key: currentEngine.key, + modelName: currentEngine.model, + baseUrl: currentEngine.baseUrl, + }); + }; return { lookupWord, extractStory, translate, analyzeText, - punctuateText + punctuateText, + summarizeTopic, }; }; diff --git a/enjoy/src/renderer/hooks/use-audio.tsx b/enjoy/src/renderer/hooks/use-audio.tsx index 787b6b60..45506bd8 100644 --- a/enjoy/src/renderer/hooks/use-audio.tsx +++ b/enjoy/src/renderer/hooks/use-audio.tsx @@ -5,17 +5,20 @@ import { 
} from "@renderer/context"; import { toast } from "@renderer/components/ui"; import { t } from "i18next"; +import { useThrottle } from "@uidotdev/usehooks"; export const useAudio = (options: { id?: string; md5?: string }) => { const { id, md5 } = options; const { EnjoyApp } = useContext(AppSettingsProviderContext); const { addDblistener, removeDbListener } = useContext(DbProviderContext); const [audio, setAudio] = useState(null); + const throttledAudio = useThrottle(audio, 500); const onAudioUpdate = (event: CustomEvent) => { const { model, action, record } = event.detail || {}; - if (model !== "Audio") return; - if (record?.id != audio?.id) return; + if (model != "Audio") return; + if (id && record.id !== id) return; + if (md5 && record.md5 !== md5) return; if (action !== "update") return; setAudio(record); @@ -38,6 +41,6 @@ export const useAudio = (options: { id?: string; md5?: string }) => { }, [id, md5]); return { - audio, + audio: throttledAudio, }; }; diff --git a/enjoy/src/renderer/hooks/use-video.tsx b/enjoy/src/renderer/hooks/use-video.tsx index 02fb091d..4d862ba1 100644 --- a/enjoy/src/renderer/hooks/use-video.tsx +++ b/enjoy/src/renderer/hooks/use-video.tsx @@ -5,17 +5,20 @@ import { } from "@renderer/context"; import { toast } from "@renderer/components/ui"; import { t } from "i18next"; +import { useThrottle } from "@uidotdev/usehooks"; export const useVideo = (options: { id?: string; md5?: string }) => { const { id, md5 } = options; const { EnjoyApp } = useContext(AppSettingsProviderContext); const { addDblistener, removeDbListener } = useContext(DbProviderContext); const [video, setVideo] = useState(null); + const throttledVideo = useThrottle(video, 500); const onAudioUpdate = (event: CustomEvent) => { const { model, action, record } = event.detail || {}; - if (model !== "Audio") return; - if (record?.id != video?.id) return; + if (model !== "Video") return; + if (id && record.id !== id) return; + if (md5 && record.md5 !== md5) return; if (action !== 
"update") return; setVideo(record); @@ -38,6 +41,6 @@ export const useVideo = (options: { id?: string; md5?: string }) => { }, [id, md5]); return { - video, + video: throttledVideo, }; };