From 5efc5fc1db5d389762ac733c3a2954d73ce26ba1 Mon Sep 17 00:00:00 2001
From: an-lee <an.lee.work@gmail.com>
Date: Fri, 19 Jan 2024 16:52:49 +0800
Subject: [PATCH] Feat: AI commands (#145)

* upgrade deps

* add extract command

* add lookup.command

* update lookup command

* fix locals

* may lookup one by one

* update lookup method

* add translate command

* cache translation by default

* open ai default settings

* use openai config in context

* refactor

* generate ipa

* update UI

* handle ai generate fail
---
 enjoy/package.json                            |   1 +
 enjoy/src/api/client.ts                       |  25 +-
 enjoy/src/commands/extract-story.command.ts   |  71 +++++
 enjoy/src/commands/index.ts                   |   4 +
 enjoy/src/commands/ipa.command.ts             |  88 ++++++
 enjoy/src/commands/lookup.command.ts          | 145 +++++++++
 enjoy/src/commands/translate.command.ts       |  49 ++++
 enjoy/src/i18n/en.json                        |  20 +-
 enjoy/src/i18n/zh-CN.json                     |  22 +-
 enjoy/src/main/db/models/conversation.ts      |   3 +
 .../src/renderer/components/lookup-result.tsx |  99 ++++---
 .../components/meanings/meaning-card.tsx      |   2 +-
 .../components/medias/media-caption.tsx       | 274 ++++++++++++++----
 .../components/medias/media-player.tsx        |   1 +
 .../components/preferences/basic-settings.tsx | 170 ++++++++---
 .../src/renderer/components/stories/index.ts  |   1 +
 .../components/stories/story-toolbar.tsx      | 158 +++-------
 .../components/stories/story-viewer.tsx       |  23 +-
 .../stories/story-vocabulary-sheet.tsx        | 155 ++++++++++
 enjoy/src/renderer/pages/story-preview.tsx    |   4 +-
 enjoy/src/renderer/pages/story.tsx            | 207 ++++++++++---
 enjoy/src/types.d.ts                          |   2 +
 enjoy/src/types/story.d.ts                    |   8 +
 enjoy/tsconfig.json                           |   3 +-
 enjoy/vite.main.config.mts                    |   1 +
 enjoy/vite.renderer.config.mts                |   1 +
 yarn.lock                                     |   8 +
 27 files changed, 1227 insertions(+), 318 deletions(-)
 create mode 100644 enjoy/src/commands/extract-story.command.ts
 create mode 100644 enjoy/src/commands/index.ts
 create mode 100644 enjoy/src/commands/ipa.command.ts
 create mode 100644 enjoy/src/commands/lookup.command.ts
 create mode 100644 enjoy/src/commands/translate.command.ts
 create mode 100644 enjoy/src/renderer/components/stories/story-vocabulary-sheet.tsx
diff --git a/enjoy/package.json b/enjoy/package.json
index c4b69fcf..9d5307a9 100644
--- a/enjoy/package.json
+++ b/enjoy/package.json
@@ -112,6 +112,7 @@
     "fs-extra": "^11.2.0",
     "html-to-text": "^9.0.5",
     "i18next": "^23.7.16",
+    "js-md5": "^0.8.3",
     "langchain": "^0.1.4",
     "lodash": "^4.17.21",
     "lucide-react": "^0.312.0",
diff --git a/enjoy/src/api/client.ts b/enjoy/src/api/client.ts
index ebf90218..90246be6 100644
--- a/enjoy/src/api/client.ts
+++ b/enjoy/src/api/client.ts
@@ -172,6 +172,17 @@ export class Client {
     return this.api.post("/api/lookups", decamelizeKeys(params));
   }
 
+  updateLookup(
+    id: string,
+    params: {
+      meaning: Partial<MeaningType>;
+      sourceId?: string;
+      sourceType?: string;
+    }
+  ): Promise<LookupType> {
+    return this.api.put(`/api/lookups/${id}`, decamelizeKeys(params));
+  }
+
   lookupInBatch(
     lookups: {
       word: string;
@@ -185,8 +196,17 @@ export class Client {
     });
   }
 
-  extractVocabularyFromStory(storyId: string): Promise<string[]> {
-    return this.api.post(`/api/stories/${storyId}/extract_vocabulary`);
+  extractVocabularyFromStory(
+    storyId: string,
+    extraction?: {
+      words?: string[];
+      idioms?: string[];
+    }
+  ): Promise<string[]> {
+    return this.api.post(
+      `/api/stories/${storyId}/extract_vocabulary`,
+      decamelizeKeys({ extraction })
+    );
   }
 
   storyMeanings(
@@ -194,7 +214,6 @@ export class Client {
     params?: {
       page?: number;
       items?: number;
-      storyId?: string;
     }
   ): Promise<
     {
diff --git a/enjoy/src/commands/extract-story.command.ts b/enjoy/src/commands/extract-story.command.ts
new file mode 100644
index 00000000..2d728287
--- /dev/null
+++ b/enjoy/src/commands/extract-story.command.ts
@@ -0,0 +1,71 @@
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "langchain/prompts";
+import { zodToJsonSchema } from "zod-to-json-schema";
+import { z } from "zod";
+
+export const extractStoryCommand = async (
+  content: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+  }
+): Promise<{ words: string[]; idioms: string[] }> => {
+  const {
+    key,
+    modelName = "gpt-3.5-turbo-1106",
+    temperature = 0,
+    baseUrl,
+  } = options;
+
+  const saveExtraction = z.object({
+    words: z.array(z.string().describe("extracted word")),
+    idioms: z.array(z.string().describe("extracted idiom")),
+  });
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    modelKwargs: {
+      response_format: {
+        type: "json_object",
+      },
+    },
+    configuration: {
+      baseURL: baseUrl,
+    },
+    cache: true,
+    verbose: true,
+  }).bind({
+    tools: [
+      {
+        type: "function",
+        function: {
+          name: "save_extraction",
+          description: "Save the extracted words and idioms from a text",
+          parameters: zodToJsonSchema(saveExtraction),
+        },
+      },
+    ],
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    ["system", EXTRACT_STORY_PROMPT],
+    ["human", "{text}"],
+  ]);
+
+  const response = await prompt.pipe(chatModel).invoke({
+    learning_language: "English",
+    text: content,
+  });
+
+  return JSON.parse(
+    response.additional_kwargs?.tool_calls?.[0]?.function?.arguments || "{}"
+  );
+};
+
+const EXTRACT_STORY_PROMPT = `
+I am an {learning_language} beginner and only have a grasp of 500 high-frequency basic words. You are an {learning_language} learning assistant robot, and your task is to analyze the article I provide and extract all the meaningful words and idioms that I may not be familiar with. Specifically, it should include common words used in uncommon ways. Return in JSON format.
+`;
diff --git a/enjoy/src/commands/index.ts b/enjoy/src/commands/index.ts
new file mode 100644
index 00000000..82846215
--- /dev/null
+++ b/enjoy/src/commands/index.ts
@@ -0,0 +1,4 @@
+export * from "./extract-story.command";
+export * from "./lookup.command";
+export * from "./translate.command";
+export * from "./ipa.command";
diff --git a/enjoy/src/commands/ipa.command.ts b/enjoy/src/commands/ipa.command.ts
new file mode 100644
index 00000000..215d803a
--- /dev/null
+++ b/enjoy/src/commands/ipa.command.ts
@@ -0,0 +1,88 @@
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "langchain/prompts";
+import { z } from "zod";
+import {
+  StructuredOutputParser,
+  OutputFixingParser,
+} from "langchain/output_parsers";
+
+export const ipaCommand = async (
+  text: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+  }
+): Promise<{ words?: { word?: string; ipa?: string }[] }> => {
+  const {
+    key,
+    modelName = "gpt-3.5-turbo-1106",
+    temperature = 0,
+    baseUrl,
+  } = options;
+
+  const responseSchema = z.object({
+    words: z.array(
+      z.object({
+        word: z.string().nonempty(),
+        ipa: z.string().nonempty(),
+      })
+    ),
+  });
+
+  const parser = StructuredOutputParser.fromZodSchema(responseSchema);
+  const fixParser = OutputFixingParser.fromLLM(
+    new ChatOpenAI({
+      openAIApiKey: key,
+      temperature: 0,
+      configuration: {
+        baseURL: baseUrl,
+      },
+    }),
+    parser
+  );
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    configuration: {
+      baseURL: baseUrl,
+    },
+    modelKwargs: {
+      response_format: {
+        type: "json_object",
+      },
+    },
+    cache: true,
+    verbose: true,
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    ["system", SYSTEM_PROMPT],
+    ["human", "{text}"],
+  ]);
+
+  const response = await prompt.pipe(chatModel).invoke({
+    learning_language: "English",
+    text,
+  });
+
+  try {
+    return await parser.parse(response.text);
+  } catch (e) {
+    return await fixParser.parse(response.text);
+  }
+};
+
+const SYSTEM_PROMPT = `Generate an array of JSON objects for each {learning_language} word in the given text, with each object containing two keys: 'word' and 'ipa', where 'ipa' is the International Phonetic Alphabet (IPA) representation of the word. Return the array in JSON format only. The output should be structured like this:
+
+{{
+  words: [
+    {{
+    word: "word",
+    ipa: "ipa"
+    }}
+  ]
+}}`;
diff --git a/enjoy/src/commands/lookup.command.ts b/enjoy/src/commands/lookup.command.ts
new file mode 100644
index 00000000..d15f7ec0
--- /dev/null
+++ b/enjoy/src/commands/lookup.command.ts
@@ -0,0 +1,145 @@
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "langchain/prompts";
+import { z } from "zod";
+import {
+  StructuredOutputParser,
+  OutputFixingParser,
+} from "langchain/output_parsers";
+
+export const lookupCommand = async (
+  params: {
+    word: string;
+    context: string;
+    meaningOptions?: Partial<MeaningType>[];
+  },
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+  }
+): Promise<{
+  id?: string;
+  word?: string;
+  context_translation?: string;
+  pos?: string;
+  pronunciation?: string;
+  definition?: string;
+  translation?: string;
+  lemma?: string;
+}> => {
+  const {
+    key,
+    modelName = "gpt-3.5-turbo-1106",
+    temperature = 0,
+    baseUrl,
+  } = options;
+  const { word, context, meaningOptions } = params;
+
+  const responseSchema = z.object({
+    id: z.string().optional(),
+    word: z.string().optional(),
+    context_translation: z.string().optional(),
+    pos: z.string().optional(),
+    pronunciation: z.string().optional(),
+    definition: z.string().optional(),
+    translation: z.string().optional(),
+    lemma: z.string().optional(),
+  });
+
+  const parser = StructuredOutputParser.fromZodSchema(responseSchema);
+  const fixParser = OutputFixingParser.fromLLM(
+    new ChatOpenAI({
+      openAIApiKey: key,
+      temperature: 0,
+      configuration: {
+        baseURL: baseUrl,
+      },
+    }),
+    parser
+  );
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    configuration: {
+      baseURL: baseUrl,
+    },
+    cache: true,
+    verbose: true,
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    ["system", DICITIONARY_PROMPT],
+    ["human", "{input}"],
+  ]);
+
+  const response = await prompt.pipe(chatModel).invoke({
+    learning_language: "English",
+    native_language: "Chinese",
+    input: JSON.stringify({
+      word,
+      context,
+      definitions: meaningOptions,
+    }),
+  });
+
+  try {
+    return await parser.parse(response.text);
+  } catch (e) {
+    return await fixParser.parse(response.text);
+  }
+};
+
+const DICITIONARY_PROMPT = `You are an {learning_language}-{native_language} dictionary. I will provide "word(it also maybe a phrase)" and "context" as input, you should return the "word", "lemma", "pronunciation", "pos(part of speech, maybe empty for phrase)", "definition", "translation" and "context_translation" as output. If I provide "definitions", you should try to select the appropriate one for the given context, and return the id of selected definition as "id". If none are suitable, generate a new definition for me. If no context is provided, return the most common definition. If you do not know the appropriate definition, return an empty string for "definition" and "translation".
+      Always return output in JSON format.
+      
+      # Example 1, with empty definitions
+      <input>
+        {{
+          "word": "booked",
+          "context": "She'd *booked* a table for four at their favourite restaurant.",
+          "definitions": []
+        }}
+      </input>
+      
+      <output> 
+      {{
+        "word": "booked",
+        "lemma": "book",
+        "pronunciation": "bʊk",
+        "pos": "verb",
+        "definition": "to arrange to have a seat, room, performer, etc. at a particular time in the future",
+        "translation": "预订",
+        "context_translation": "她已经在他们最喜欢的餐厅预订了四人桌位。"
+      }}
+      </output> 
+      
+      # Example 2, with definitions
+      <input>
+      {{
+        "word": "booked",
+        "context": "She'd *booked* a table for four at their favourite restaurant.",
+        "definitions": [
+          {{
+            "id": "767ddbf3-c08a-42e1-95c8-c48e681f3486",
+            "pos": "noun",
+            "definition": "a written text that can be published in printed or electronic form",
+          }},
+          {{
+            "id": "37940295-ef93-4873-af60-f03bf7e271f0",
+            "pos": "verb",
+            "definition": "to arrange to have a seat, room, performer, etc. at a particular time in the future",
+          }}
+        ]
+      }}
+      </input>
+      
+      <output>
+        {{
+          "id": "37940295-ef93-4873-af60-f03bf7e271f0",
+          "context_translation": "她已经在他们最喜欢的餐厅预订了四人桌位。"
+        }}
+      </output> 
+  `;
diff --git a/enjoy/src/commands/translate.command.ts b/enjoy/src/commands/translate.command.ts
new file mode 100644
index 00000000..07020917
--- /dev/null
+++ b/enjoy/src/commands/translate.command.ts
@@ -0,0 +1,49 @@
+import { ChatOpenAI } from "@langchain/openai";
+import { ChatPromptTemplate } from "langchain/prompts";
+
+export const translateCommand = async (
+  text: string,
+  options: {
+    key: string;
+    modelName?: string;
+    temperature?: number;
+    baseUrl?: string;
+  }
+): Promise<string> => {
+  const {
+    key,
+    modelName = "gpt-3.5-turbo-1106",
+    temperature = 0,
+    baseUrl,
+  } = options;
+
+  const chatModel = new ChatOpenAI({
+    openAIApiKey: key,
+    modelName,
+    temperature,
+    configuration: {
+      baseURL: baseUrl,
+    },
+    cache: true,
+    verbose: true,
+  });
+
+  const prompt = ChatPromptTemplate.fromMessages([
+    ["system", SYSTEM_PROMPT],
+    ["human", TRANSLATION_PROMPT],
+  ]);
+
+  const response = await prompt.pipe(chatModel).invoke({
+    native_language: "Chinese",
+    text,
+  });
+
+  return response.text;
+};
+
+const SYSTEM_PROMPT =
+  "You are a professional, authentic translation engine, only returns translations.";
+const TRANSLATION_PROMPT = `Translate the text to {native_language} Language, please do not explain my original text.:
+
+{text}
+`;
diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
index bd66a496..c7e4fc81 100644
--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -87,7 +87,7 @@
       "ttsBaseUrl": "TTS base URL",
       "notFound": "Conversation not found",
       "contentRequired": "Content required",
-      "failedToGenerateResponse": "Failed to generate response"
+      "failedToGenerateResponse": "Failed to generate response, please retry"
     },
     "pronunciationAssessment": {
       "pronunciationScore": "Pronunciation Score",
@@ -156,6 +156,8 @@
   "autoCenter": "auto center",
   "inlineCaption": "inline caption",
   "autoScroll": "auto scroll",
+  "translate": "translate",
+  "displayIpa": "display IPA",
   "detail": "detail",
   "remove": "remove",
   "share": "share",
@@ -295,7 +297,12 @@
   "whisperIsNotWorking": "Whisper is not working",
   "relaunchIsNeededAfterChanged": "Relaunch is needed after changed",
   "openaiKeySaved": "OpenAI key saved",
+  "openaiConfigSaved": "OpenAI config saved",
   "openaiKeyRequired": "OpenAI key required",
+  "baseUrl": "baseURL",
+  "model": "model",
+  "key": "key",
+  "leaveEmptyToUseDefault": "Leave empty to use default",
   "newConversation": "New conversation",
   "startConversation": "Start conversation",
   "editConversation": "Edit conversation",
@@ -336,8 +343,17 @@
   "backSide": "back side",
   "aiExtractVocabulary": "AI extract vocabulary",
   "toggleReadable": "Toggle readable",
+  "extracting": "Extracting",
+  "extractionFailed": "Extraction failed",
+  "extractedSuccessfully": "Extracted successfully",
+  "lookUp": "Look up",
+  "lookUpAll": "Look up all",
   "lookingUp": "Looking up",
-  "thereAreLookupsPending": "There are {{count}} lookups pending",
+  "pending": "Pending",
+  "thereAreLookupsProcessing": "There are {{count}} lookups processing",
+  "thereAreLookupsPending": "There are {{count}} lookups waiting",
+  "lookupFailed": "Lookup failed",
+  "lookedUpSuccessfully": "Looked up successfully",
   "noRecordsFound": "No records found",
   "pleaseTryLater": "Please try later",
   "author": "author",
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
index d58c4f65..e040d1e8 100644
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -65,7 +65,7 @@
     "conversation": {
       "name": "对话标题",
       "engine": "AI 引擎",
-      "baseUrl": "请求地址",
+      "baseUrl": "接口地址",
       "configuration": "AI 配置",
       "model": "AI 模型",
       "roleDefinition": "角色定义",
@@ -87,7 +87,7 @@
       "ttsBaseUrl": "TTS 请求地址",
       "notFound": "未找到对话",
       "contentRequired": "对话内容不能为空",
-      "failedToGenerateResponse": "生成失败"
+      "failedToGenerateResponse": "生成失败，请重试"
     },
     "pronunciationAssessment": {
       "pronunciationScore": "发音得分",
@@ -156,6 +156,8 @@
   "autoCenter": "自动居中",
   "inlineCaption": "内联字幕",
   "autoScroll": "自动滚动",
+  "translate": "翻译",
+  "displayIpa": "标注音标",
   "detail": "详情",
   "remove": "删除",
   "share": "分享",
@@ -294,7 +296,12 @@
   "whisperIsNotWorking": "Whisper 无法正常工作，请尝试更换模型后重试，或联系开发者",
   "relaunchIsNeededAfterChanged": "更改后需要重新启动",
   "openaiKeySaved": "OpenAI 密钥已保存",
+  "openaiConfigSaved": "OpenAI 配置已保存",
   "openaiKeyRequired": "未提供 OpenAI 密钥",
+  "baseUrl": "接口地址",
+  "model": "模型",
+  "key": "密钥",
+  "leaveEmptyToUseDefault": "留空则使用默认值",
   "newConversation": "新对话",
   "startConversation": "开始对话",
   "editConversation": "编辑对话",
@@ -335,8 +342,17 @@
   "backSide": "反面",
   "aiExtractVocabulary": "AI 提取生词",
   "toggleReadable": "切换阅读模式",
+  "extracting": "正在提取",
+  "extractionFailed": "提取失败",
+  "extractedSuccessfully": "提取成功",
+  "lookUp": "查询",
+  "lookUpAll": "全部查询",
   "lookingUp": "正在查询",
-  "thereAreLookupsPending": "有{{count}}个单词正在查询",
+  "pending": "等待中",
+  "thereAreLookupsProcessing": "有{{count}}个单词正在查询",
+  "thereAreLookupsPending": "有{{count}}个单词正在等待查询",
+  "lookupFailed": "查询失败",
+  "lookedUpSuccessfully": "查询成功",
   "noRecordsFound": "没有找到记录",
   "pleaseTryLater": "请稍后再试",
   "author": "作者",
diff --git a/enjoy/src/main/db/models/conversation.ts b/enjoy/src/main/db/models/conversation.ts
index 47a9d34d..53de178d 100644
--- a/enjoy/src/main/db/models/conversation.ts
+++ b/enjoy/src/main/db/models/conversation.ts
@@ -297,6 +297,9 @@ export class Conversation extends Model<Conversation> {
 
     const replies = await Promise.all(
       response.map(async (generation) => {
+        if (!generation?.text) {
+          throw new Error(t("models.conversation.failedToGenerateResponse"));
+        }
         return await Message.create(
           {
             conversationId: this.id,
diff --git a/enjoy/src/renderer/components/lookup-result.tsx b/enjoy/src/renderer/components/lookup-result.tsx
index bc2ab41b..4a97e90b 100644
--- a/enjoy/src/renderer/components/lookup-result.tsx
+++ b/enjoy/src/renderer/components/lookup-result.tsx
@@ -1,9 +1,14 @@
-import { AppSettingsProviderContext } from "@renderer/context";
+import {
+  AppSettingsProviderContext,
+  AISettingsProviderContext,
+} from "@renderer/context";
 import { useState, useContext, useEffect } from "react";
 import { LoaderSpin, MeaningCard } from "@renderer/components";
 import { Button } from "@renderer/components/ui";
 import { t } from "i18next";
 import { XCircleIcon } from "lucide-react";
+import { toast } from "@renderer/components/ui";
+import { lookupCommand } from "@commands";
 
 export const LookupResult = (props: {
   word: string;
@@ -13,49 +18,70 @@ export const LookupResult = (props: {
   onResult?: (meaning: MeaningType) => void;
 }) => {
   const { word, context, sourceId, sourceType, onResult } = props;
-  const [timer, setTimer] = useState<NodeJS.Timeout>();
   const [result, setResult] = useState<LookupType>();
   const [loading, setLoading] = useState<boolean>(true);
   if (!word) return null;
 
   const { webApi } = useContext(AppSettingsProviderContext);
+  const { openai } = useContext(AISettingsProviderContext);
 
-  const lookup = (retries = 0) => {
+  const processLookup = async () => {
     if (!word) return;
-    if (retries > 3) {
-      setLoading(false);
-      return;
-    }
+    if (!loading) return;
 
-    retries += 1;
-    webApi
-      .lookup({
-        word,
-        context,
-        sourceId,
-        sourceType,
-      })
-      .then((res) => {
-        if (res?.meaning) {
-          setResult(res);
-          setLoading(false);
-          onResult && onResult(res.meaning);
-        } else {
-          // Retry after 1.5s
-          const _timeout = setTimeout(() => {
-            lookup(retries);
-          }, 1500);
-          setTimer(_timeout);
+    setLoading(true);
+    const lookup = await webApi.lookup({
+      word,
+      context,
+      sourceId,
+      sourceType,
+    });
+
+    if (lookup.meaning) {
+      setResult(lookup);
+      setLoading(false);
+      onResult && onResult(lookup.meaning);
+    } else {
+      if (!openai?.key) {
+        toast.error(t("openaiApiKeyRequired"));
+        return;
+      }
+
+      lookupCommand(
+        {
+          word,
+          context,
+          meaningOptions: lookup.meaningOptions,
+        },
+        {
+          key: openai.key,
         }
-      });
+      )
+        .then((res) => {
+          if (res.context_translation?.trim()) {
+            webApi
+              .updateLookup(lookup.id, {
+                meaning: res,
+                sourceId,
+                sourceType,
+              })
+              .then((lookup) => {
+                setResult(lookup);
+                onResult && onResult(lookup.meaning);
+              });
+          }
+        })
+        .catch((err) => {
+          toast.error(`${t("lookupFailed")}: ${err.message}`);
+        })
+        .finally(() => {
+          setLoading(false);
+        });
+    }
   };
 
   useEffect(() => {
-    lookup();
-
-    return () => {
-      if (timer) clearTimeout(timer);
-    };
+    processLookup();
   }, [word, context]);
 
   if (result?.meaning) {
@@ -95,14 +121,7 @@ export const LookupResult = (props: {
     <div className="px-4 py-2">
       <div className="font-bold mb-4">{word}</div>
       <div className="flex justify-center">
-        <Button
-          onClick={() => {
-            setLoading(true);
-            lookup();
-          }}
-          variant="default"
-          size="sm"
-        >
+        <Button onClick={processLookup} variant="default" size="sm">
           {t("retry")}
         </Button>
       </div>
diff --git a/enjoy/src/renderer/components/meanings/meaning-card.tsx b/enjoy/src/renderer/components/meanings/meaning-card.tsx
index 12ce1acf..362a4149 100644
--- a/enjoy/src/renderer/components/meanings/meaning-card.tsx
+++ b/enjoy/src/renderer/components/meanings/meaning-card.tsx
@@ -23,7 +23,7 @@ export const MeaningCard = (props: {
   const lookups = [lookup, ..._lookups].filter(Boolean);
 
   return (
-    <div className="">
+    <div className="select-text ">
       <div className="font-bold mb-2">{word}</div>
       <div className="mb-2">
         {pos && (
diff --git a/enjoy/src/renderer/components/medias/media-caption.tsx b/enjoy/src/renderer/components/medias/media-caption.tsx
index ba568706..6b9b7735 100644
--- a/enjoy/src/renderer/components/medias/media-caption.tsx
+++ b/enjoy/src/renderer/components/medias/media-caption.tsx
@@ -1,13 +1,31 @@
-import { useState, useEffect } from "react";
+import { useState, useEffect, useContext } from "react";
 import { cn } from "@renderer/lib/utils";
 import {
   Button,
+  DropdownMenu,
+  DropdownMenuContent,
+  DropdownMenuItem,
+  DropdownMenuTrigger,
   Popover,
   PopoverContent,
   PopoverAnchor,
+  toast,
 } from "@renderer/components/ui";
 import { LookupResult } from "@renderer/components";
-import { LanguagesIcon, PlayIcon } from "lucide-react";
+import {
+  ChevronDownIcon,
+  LanguagesIcon,
+  PlayIcon,
+  LoaderIcon,
+  SpeechIcon,
+} from "lucide-react";
+import { translateCommand, ipaCommand } from "@commands";
+import {
+  AppSettingsProviderContext,
+  AISettingsProviderContext,
+} from "@renderer/context";
+import { t } from "i18next";
+import { md5 } from "js-md5";
 
 export const MediaCaption = (props: {
   mediaId: string;
@@ -36,6 +54,92 @@ export const MediaCaption = (props: {
       left: number;
     };
   }>();
+  const [translation, setTranslation] = useState<string>();
+  const [translating, setTranslating] = useState<boolean>(false);
+  const [displayTranslation, setDisplayTranslation] = useState<boolean>(false);
+
+  const [ipa, setIpa] = useState<{ word?: string; ipa?: string }[]>([]);
+  const [ipaGenerating, setIpaGenerating] = useState<boolean>(false);
+  const [displayIpa, setDisplayIpa] = useState<boolean>(false);
+
+  const { EnjoyApp } = useContext(AppSettingsProviderContext);
+  const { openai } = useContext(AISettingsProviderContext);
+
+  const toogleIPA = async () => {
+    if (ipaGenerating) return;
+
+    if (ipa.length > 0) {
+      setDisplayIpa(!displayIpa);
+      return;
+    }
+
+    const hash = md5.create();
+    hash.update(transcription.text);
+    const cacheKey = `ipa-${hash.hex()}`;
+    const cached = await EnjoyApp.cacheObjects.get(cacheKey);
+    if (cached) {
+      setIpa(cached);
+      return;
+    }
+
+    if (!openai?.key) {
+      toast.error(t("openaiApiKeyRequired"));
+      return;
+    }
+    setIpaGenerating(true);
+
+    ipaCommand(transcription.text, {
+      key: openai.key,
+    })
+      .then((result) => {
+        if (result?.words?.length > 0) {
+          setIpa(result.words);
+          EnjoyApp.cacheObjects.set(cacheKey, result.words);
+          setDisplayIpa(true);
+        }
+      })
+      .finally(() => {
+        setIpaGenerating(false);
+      });
+  };
+
+  const translate = async () => {
+    if (translating) return;
+
+    if (translation) {
+      setDisplayTranslation(!displayTranslation);
+      return;
+    }
+
+    const hash = md5.create();
+    hash.update(transcription.text);
+    const cacheKey = `translate-${hash.hex()}`;
+    const cached = await EnjoyApp.cacheObjects.get(cacheKey);
+    if (cached) {
+      setTranslation(cached);
+      return;
+    }
+
+    if (!openai?.key) {
+      toast.error(t("openaiApiKeyRequired"));
+      return;
+    }
+    setTranslating(true);
+
+    translateCommand(transcription.text, {
+      key: openai.key,
+    })
+      .then((result) => {
+        if (result) {
+          setTranslation(result);
+          EnjoyApp.cacheObjects.set(cacheKey, result);
+          setDisplayTranslation(true);
+        }
+      })
+      .finally(() => {
+        setTranslating(false);
+      });
+  };
 
   useEffect(() => {
     if (!transcription) return;
@@ -54,67 +158,117 @@ export const MediaCaption = (props: {
 
   return (
     <div className={cn("relative px-4 py-2 text-lg", className)}>
-      <div className="flex flex-wrap">
-        {(transcription.segments || []).map((w, index) => (
-          <span
-            key={index}
-            className={`mr-1 cursor-pointer hover:bg-red-500/10 ${
-              index === activeIndex ? "text-red-500" : ""
-            }`}
-            onClick={(event) => {
-              setSelected({
-                index,
-                word: w.text,
-                position: {
-                  top:
-                    event.currentTarget.offsetTop +
-                    event.currentTarget.offsetHeight,
-                  left: event.currentTarget.offsetLeft,
-                },
-              });
+      <div className="flex items-start space-x-4">
+        <div className="flex-1">
+          <div className="flex flex-wrap">
+            {(transcription.segments || []).map((w, index) => (
+              <div
+                key={index}
+                className={`mr-1 cursor-pointer hover:bg-red-500/10 ${
+                  index === activeIndex ? "text-red-500" : ""
+                }`}
+                onClick={(event) => {
+                  setSelected({
+                    index,
+                    word: w.text,
+                    position: {
+                      top:
+                        event.currentTarget.offsetTop +
+                        event.currentTarget.offsetHeight,
+                      left: event.currentTarget.offsetLeft,
+                    },
+                  });
 
-              setIsPlaying(false);
-              if (onSeek) onSeek(w.offsets.from / 1000);
-            }}
-          >
-            {w.text}
-          </span>
-        ))}
-
-        <Popover
-          open={Boolean(selected) && !isPlaying}
-          onOpenChange={(value) => {
-            if (!value) setSelected(null);
-          }}
-        >
-          <PopoverAnchor
-            className="absolute w-0 h-0"
-            style={{
-              top: selected?.position?.top,
-              left: selected?.position?.left,
-            }}
-          ></PopoverAnchor>
-          <PopoverContent
-            className="w-full max-w-md p-0"
-            updatePositionStrategy="always"
-          >
-            {selected?.word && (
-              <ResourceCaptionSelectionMenu
-                word={selected.word}
-                context={transcription.segments
-                  .map((w) => w.text)
-                  .join(" ")
-                  .trim()}
-                mediaId={props.mediaId}
-                mediaType={props.mediaType}
-                onPlay={() => {
-                  setIsPlaying(true);
+                  setIsPlaying(false);
+                  if (onSeek) onSeek(w.offsets.from / 1000);
                 }}
-              />
-            )}
-          </PopoverContent>
-        </Popover>
+              >
+                <div>{w.text}</div>
+                {displayIpa &&
+                  ipa.find(
+                    (i) =>
+                      i.word.trim() === w.text.replace(/[\.,?!]/g, "").trim()
+                  )?.ipa && (
+                    <div className="text-sm text-foreground/70 font-serif">
+                      {
+                        ipa.find(
+                          (i) =>
+                            i.word.trim() ===
+                            w.text.replace(/[\.,?!]/g, "").trim()
+                        )?.ipa
+                      }
+                    </div>
+                  )}
+              </div>
+            ))}
+          </div>
+          {displayTranslation && translation && (
+            <div className="select-text py-2 text-sm text-foreground/70">
+              {translation}
+            </div>
+          )}
+        </div>
+
+        <DropdownMenu>
+          <DropdownMenuTrigger asChild>
+            <Button variant="ghost" size="icon">
+              <ChevronDownIcon className="w-4 h-4" />
+            </Button>
+          </DropdownMenuTrigger>
+          <DropdownMenuContent>
+            <DropdownMenuItem disabled={translating} onClick={translate}>
+              {translating ? (
+                <LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
+              ) : (
+                <LanguagesIcon className="w-4 h-4 mr-2" />
+              )}
+              <span>{t("translate")}</span>
+            </DropdownMenuItem>
+            <DropdownMenuItem disabled={ipaGenerating} onClick={toogleIPA}>
+              {ipaGenerating ? (
+                <LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
+              ) : (
+                <SpeechIcon className="w-4 h-4 mr-2" />
+              )}
+              <span>{t("displayIpa")}</span>
+            </DropdownMenuItem>
+          </DropdownMenuContent>
+        </DropdownMenu>
       </div>
+
+      <Popover
+        open={Boolean(selected) && !isPlaying}
+        onOpenChange={(value) => {
+          if (!value) setSelected(null);
+        }}
+      >
+        <PopoverAnchor
+          className="absolute w-0 h-0"
+          style={{
+            top: selected?.position?.top,
+            left: selected?.position?.left,
+          }}
+        ></PopoverAnchor>
+        <PopoverContent
+          className="w-full max-w-md p-0"
+          updatePositionStrategy="always"
+        >
+          {selected?.word && (
+            <ResourceCaptionSelectionMenu
+              word={selected.word}
+              context={transcription.segments
+                .map((w) => w.text)
+                .join(" ")
+                .trim()}
+              mediaId={props.mediaId}
+              mediaType={props.mediaType}
+              onPlay={() => {
+                setIsPlaying(true);
+              }}
+            />
+          )}
+        </PopoverContent>
+      </Popover>
     </div>
   );
 };
diff --git a/enjoy/src/renderer/components/medias/media-player.tsx b/enjoy/src/renderer/components/medias/media-player.tsx
index 28ef95cc..689e3a0f 100644
--- a/enjoy/src/renderer/components/medias/media-player.tsx
+++ b/enjoy/src/renderer/components/medias/media-player.tsx
@@ -538,6 +538,7 @@ export const MediaPlayer = (props: {
       {initialized && (
         <div className={recordButtonVisible && mediaProvider ? "" : "hidden"}>
           <MediaCaption
+            key={`${mediaId}-${currentSegmentIndex}`}
             mediaId={mediaId}
             mediaType={mediaType}
             currentTime={currentTime}
diff --git a/enjoy/src/renderer/components/preferences/basic-settings.tsx b/enjoy/src/renderer/components/preferences/basic-settings.tsx
index 671d4de8..5a9aaa4a 100644
--- a/enjoy/src/renderer/components/preferences/basic-settings.tsx
+++ b/enjoy/src/renderer/components/preferences/basic-settings.tsx
@@ -1,3 +1,6 @@
+import * as z from "zod";
+import { useForm } from "react-hook-form";
+import { zodResolver } from "@hookform/resolvers/zod";
 import { t } from "i18next";
 import {
   AlertDialog,
@@ -19,6 +22,12 @@ import {
   DialogHeader,
   DialogDescription,
   DialogFooter,
+  FormField,
+  Form,
+  FormItem,
+  FormLabel,
+  FormControl,
+  FormMessage,
   Input,
   Label,
   Separator,
@@ -29,7 +38,7 @@ import {
   SelectValue,
   SelectContent,
 } from "@renderer/components/ui";
-import { WhisperModelOptions } from "@renderer/components";
+import { WhisperModelOptions, LLM_PROVIDERS } from "@renderer/components";
 import {
   AppSettingsProviderContext,
   AISettingsProviderContext,
@@ -364,62 +373,134 @@ const WhisperSettings = () => {
 const OpenaiSettings = () => {
   const { openai, setOpenai } = useContext(AISettingsProviderContext);
   const [editing, setEditing] = useState(false);
-  const ref = useRef<HTMLInputElement>();
 
-  const handleSave = () => {
-    if (!ref.current) return;
+  // Validation schema for the OpenAI settings form; `model` must be one of
+  // the entries in LLM_PROVIDERS.openai.models.
+  const openAiConfigSchema = z.object({
+    key: z.string().optional(),
+    model: z.enum(LLM_PROVIDERS.openai.models),
+    baseUrl: z.string().optional(),
+  });
 
+  // Seed the form from the settings currently held in context.
+  const form = useForm<z.infer<typeof openAiConfigSchema>>({
+    resolver: zodResolver(openAiConfigSchema),
+    values: {
+      key: openai?.key,
+      model: openai?.model,
+      baseUrl: openai?.baseUrl,
+    },
+  });
+
+  // Save the validated values to context and leave edit mode.
+  const onSubmit = async (data: z.infer<typeof openAiConfigSchema>) => {
     setOpenai({
-      key: ref.current.value,
+      ...data,
     });
     setEditing(false);
-
-    toast.success(t("openaiKeySaved"));
+    toast.success(t("openaiConfigSaved"));
   };
 
-  useEffect(() => {
-    if (editing) {
-      ref.current?.focus();
-    }
-  }, [editing]);
-
   return (
-    <div className="flex items-start justify-between py-4">
-      <div className="">
-        <div className="mb-2">Open AI</div>
-        <div className="text-sm text-muted-foreground">
-          <div className="flex items-center space-x-4">
-            <Label>{t("key")}:</Label>
-            <Input
-              ref={ref}
-              type="password"
-              defaultValue={openai?.key}
-              placeholder="sk-*********"
-              disabled={!editing}
-              className="focus-visible:outline-0 focus-visible:ring-0 shadow-none"
-            />
-            {editing && (
-              <Button
-                size="sm"
-                className="min-w-max text-md"
-                onClick={handleSave}
-              >
-                {t("save")}
-              </Button>
-            )}
+    <Form {...form}>
+      <form onSubmit={form.handleSubmit(onSubmit)}>
+        <div className="flex items-start justify-between py-4">
+          <div className="">
+            <div className="mb-2">Open AI</div>
+            <div className="text-sm text-muted-foreground space-y-1">
+              {/* `?? ""` keeps the text inputs controlled when unset. */}
+              <FormField
+                control={form.control}
+                name="key"
+                render={({ field }) => (
+                  <FormItem>
+                    <div className="flex items-center space-x-2">
+                      <FormLabel>{t("key")}:</FormLabel>
+                      <Input
+                        disabled={!editing}
+                        type="password"
+                        value={field.value ?? ""}
+                        onChange={field.onChange}
+                      />
+                    </div>
+                    <FormMessage />
+                  </FormItem>
+                )}
+              />
+              <FormField
+                control={form.control}
+                name="model"
+                render={({ field }) => (
+                  <FormItem>
+                    <div className="flex items-center space-x-2">
+                      <FormLabel>{t("model")}:</FormLabel>
+                      <Select
+                        disabled={!editing}
+                        onValueChange={field.onChange}
+                        value={field.value}
+                      >
+                        <FormControl>
+                          <SelectTrigger>
+                            <SelectValue placeholder={t("selectAiModel")} />
+                          </SelectTrigger>
+                        </FormControl>
+                        <SelectContent>
+                          {(LLM_PROVIDERS.openai.models || []).map(
+                            (option: string) => (
+                              <SelectItem key={option} value={option}>
+                                {option}
+                              </SelectItem>
+                            )
+                          )}
+                        </SelectContent>
+                      </Select>
+                    </div>
+                    <FormMessage />
+                  </FormItem>
+                )}
+              />
+              <FormField
+                control={form.control}
+                name="baseUrl"
+                render={({ field }) => (
+                  <FormItem>
+                    <div className="flex items-center space-x-2">
+                      <FormLabel>{t("baseUrl")}:</FormLabel>
+                      <Input
+                        disabled={!editing}
+                        placeholder={t("leaveEmptyToUseDefault")}
+                        value={field.value ?? ""}
+                        onChange={field.onChange}
+                      />
+                    </div>
+                    <FormMessage />
+                  </FormItem>
+                )}
+              />
+            </div>
+          </div>
+
+          <div className="flex items-center space-x-2">
+            {/* Edit/Cancel toggle; resets the form on every click. */}
+            <Button
+              variant={editing ? "outline" : "secondary"}
+              size="sm"
+              type="reset"
+              onClick={(event) => {
+                event.preventDefault();
+                form.reset();
+                setEditing(!editing);
+              }}
+            >
+              {editing ? t("cancel") : t("edit")}
+            </Button>
+            <Button className={editing ? "" : "hidden"} size="sm" type="submit">
+              {t("save")}
+            </Button>
           </div>
         </div>
-      </div>
-      <div className="">
-        <Button
-          variant={editing ? "outline" : "secondary"}
-          size="sm"
-          onClick={() => setEditing(!editing)}
-        >
-          {editing ? t("cancel") : t("edit")}
-        </Button>
-      </div>
-    </div>
+      </form>
+    </Form>
   );
 };
 
diff --git a/enjoy/src/renderer/components/stories/index.ts b/enjoy/src/renderer/components/stories/index.ts
index 7bcc4271..34383ecc 100644
--- a/enjoy/src/renderer/components/stories/index.ts
+++ b/enjoy/src/renderer/components/stories/index.ts
@@ -4,6 +4,7 @@ export * from "./story-preview-toolbar";
 export * from "./story-toolbar";
 export * from "./story-viewer";
 export * from "./story-content";
+export * from "./story-vocabulary-sheet";
 
 export * from "./stories-segment";
 export * from "./ted-ideas-segment";
diff --git a/enjoy/src/renderer/components/stories/story-toolbar.tsx b/enjoy/src/renderer/components/stories/story-toolbar.tsx
index b3740b3e..bf4cb475 100644
--- a/enjoy/src/renderer/components/stories/story-toolbar.tsx
+++ b/enjoy/src/renderer/components/stories/story-toolbar.tsx
@@ -1,7 +1,4 @@
 import {
-  Alert,
-  AlertTitle,
-  AlertDescription,
   AlertDialog,
   AlertDialogTrigger,
   AlertDialogContent,
@@ -12,16 +9,9 @@ import {
   AlertDialogCancel,
   AlertDialogAction,
   Button,
-  ScrollArea,
-  Separator,
-  Sheet,
-  SheetHeader,
-  SheetContent,
   FloatingToolbar,
   ToolbarButton,
 } from "@renderer/components/ui";
-import { MeaningCard, NoRecordsFound, LoaderSpin } from "@renderer/components";
-import { useState } from "react";
 import {
   HighlighterIcon,
   ScanTextIcon,
@@ -46,123 +36,67 @@ export const StoryToolbar = (props: {
   meanings?: MeaningType[];
   marked?: boolean;
   toggleMarked?: () => void;
-  pendingLookups?: LookupType[];
   handleShare?: () => void;
+  vocabularyVisible: boolean;
+  setVocabularyVisible?: (value: boolean) => void;
 }) => {
   const {
     starred,
     toggleStarred,
-    extracted,
     scanning,
     onScan,
     marked,
     toggleMarked,
-    meanings = [],
-    pendingLookups = [],
     handleShare,
+    vocabularyVisible,
+    setVocabularyVisible,
   } = props;
 
-  const [vocabularyVisible, setVocabularyVisible] = useState<boolean>(
-    !extracted
-  );
-
+  // Floating action bar: vocabulary scan/toggle, highlight, star and share.
   return (
-    <>
-      <FloatingToolbar>
-        <ToolbarButton
-          disabled={scanning}
-          toggled={vocabularyVisible}
-          onClick={() => {
-            onScan();
-            setVocabularyVisible(!vocabularyVisible);
-          }}
-        >
-          {scanning ? (
-            <LoaderIcon className="w-6 h-6 animate-spin" />
-          ) : (
-            <ScanTextIcon className="w-6 h-6" />
-          )}
-        </ToolbarButton>
-        <ToolbarButton toggled={marked} onClick={toggleMarked}>
-          <HighlighterIcon className="w-6 h-6" />
-        </ToolbarButton>
-        <ToolbarButton toggled={starred} onClick={toggleStarred}>
-          <StarIcon className="w-6 h-6" />
-        </ToolbarButton>
-        <AlertDialog>
-          <AlertDialogTrigger asChild>
-            <ToolbarButton toggled={false} onClick={toggleStarred}>
-              <Share2Icon className="w-6 h-6" />
-            </ToolbarButton>
-          </AlertDialogTrigger>
-          <AlertDialogContent>
-            <AlertDialogHeader>
-              <AlertDialogTitle>{t("shareStory")}</AlertDialogTitle>
-              <AlertDialogDescription>
-                {t("areYouSureToShareThisStoryToCommunity")}
-              </AlertDialogDescription>
-            </AlertDialogHeader>
-            <AlertDialogFooter>
-              <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
-              <AlertDialogAction>
-                <Button onClick={handleShare}>{t("share")}</Button>
-              </AlertDialogAction>
-            </AlertDialogFooter>
-          </AlertDialogContent>
-        </AlertDialog>
-      </FloatingToolbar>
-
-      <Sheet
-        open={!!vocabularyVisible}
-        onOpenChange={(value) => {
-          if (!value) setVocabularyVisible(null);
+    <FloatingToolbar>
+      <ToolbarButton
+        disabled={scanning}
+        toggled={vocabularyVisible}
+        onClick={() => {
+          onScan();
+          setVocabularyVisible?.(!vocabularyVisible);
         }}
       >
-        <SheetContent side="bottom" className="rounded-t-2xl shadow-lg h-5/6">
-          <SheetHeader className="flex items-center justify-center mb-2">
-            <div className="text-center">
-              <span className="font-semibold text-xl capitalize">
-                {t("keyVocabulary")}
-              </span>
-              <span className="ml-2 text-sm text-muted-foreground">
-                ({meanings.length})
-              </span>
-            </div>
-          </SheetHeader>
-          <div className="w-full max-w-prose mx-auto h-full overflow-hidden px-4">
-            <ScrollArea className="h-full pb-12">
-              {extracted ? (
-                <>
-                  {pendingLookups.length > 0 && (
-                    <Alert className="mb-4">
-                      <LoaderIcon className="w-5 h-5 text-muted-foreground animate-spin" />
-                      <AlertTitle>{t("lookingUp")}</AlertTitle>
-                      <AlertDescription>
-                        {t("thereAreLookupsPending", {
-                          count: pendingLookups.length,
-                        })}
-                      </AlertDescription>
-                    </Alert>
-                  )}
-
-                  {meanings.length > 0 ? (
-                    meanings.map((meaning) => (
-                      <div key={meaning.id} className="">
-                        <MeaningCard meaning={meaning} />
-                        <Separator className="my-4" />
-                      </div>
-                    ))
-                  ) : (
-                    <NoRecordsFound />
-                  )}
-                </>
-              ) : (
-                <LoaderSpin />
-              )}
-            </ScrollArea>
-          </div>
-        </SheetContent>
-      </Sheet>
-    </>
+        {scanning ? (
+          <LoaderIcon className="w-6 h-6 animate-spin" />
+        ) : (
+          <ScanTextIcon className="w-6 h-6" />
+        )}
+      </ToolbarButton>
+      <ToolbarButton toggled={marked} onClick={toggleMarked}>
+        <HighlighterIcon className="w-6 h-6" />
+      </ToolbarButton>
+      <ToolbarButton toggled={starred} onClick={toggleStarred}>
+        <StarIcon className="w-6 h-6" />
+      </ToolbarButton>
+      {/* Share: the trigger only opens the confirmation dialog. */}
+      <AlertDialog>
+        <AlertDialogTrigger asChild>
+          <ToolbarButton toggled={false}>
+            <Share2Icon className="w-6 h-6" />
+          </ToolbarButton>
+        </AlertDialogTrigger>
+        <AlertDialogContent>
+          <AlertDialogHeader>
+            <AlertDialogTitle>{t("shareStory")}</AlertDialogTitle>
+            <AlertDialogDescription>
+              {t("areYouSureToShareThisStoryToCommunity")}
+            </AlertDialogDescription>
+          </AlertDialogHeader>
+          <AlertDialogFooter>
+            <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
+            <AlertDialogAction>
+              <Button onClick={handleShare}>{t("share")}</Button>
+            </AlertDialogAction>
+          </AlertDialogFooter>
+        </AlertDialogContent>
+      </AlertDialog>
+    </FloatingToolbar>
   );
 };
diff --git a/enjoy/src/renderer/components/stories/story-viewer.tsx b/enjoy/src/renderer/components/stories/story-viewer.tsx
index 9f30eea3..022fde66 100644
--- a/enjoy/src/renderer/components/stories/story-viewer.tsx
+++ b/enjoy/src/renderer/components/stories/story-viewer.tsx
@@ -9,7 +9,7 @@ import {
   PopoverAnchor,
 } from "@renderer/components/ui";
 import { SelectionMenu } from "@renderer/components";
-import { debounce , uniq } from "lodash";
+import { debounce, uniq } from "lodash";
 import Mark from "mark.js";
 
 export const StoryViewer = (props: {
@@ -17,7 +17,7 @@ export const StoryViewer = (props: {
   marked?: boolean;
   meanings?: MeaningType[];
   setMeanings: (meanings: MeaningType[]) => void;
-  pendingLookups?: LookupType[];
+  pendingLookups?: Partial<LookupType>[];
   doc: any;
 }) => {
   const navigate = useNavigate();
@@ -48,6 +48,8 @@ export const StoryViewer = (props: {
 
   const handleSelectionChanged = debounce(() => {
     const selection = document.getSelection();
+    // anchorNode is null while nothing is selected; skip outside selections.
+    if (!ref.current?.contains(selection?.anchorNode?.parentElement)) return;
     const word = selection
       .toString()
       .trim()
@@ -73,17 +75,16 @@ export const StoryViewer = (props: {
     return () => {
       document.removeEventListener("selectionchange", handleSelectionChanged);
     };
-  }, [story]);
+  }, [story, ref]);
 
   useEffect(() => {
-    const words = uniq([
-      ...meanings.map((m) => m.word),
-      ...pendingLookups.map((l) => l.word),
-    ]);
-    if (words.length === 0) return;
-
     const marker = new Mark(ref.current);
     if (marked) {
+      const words = uniq([
+        ...meanings.map((m) => m.word),
+        ...pendingLookups.map((l) => l.word),
+      ]);
+      if (words.length === 0) return;
       marker.mark(words, {
         separateWordSearch: false,
         caseSensitive: false,
@@ -92,6 +93,10 @@ export const StoryViewer = (props: {
     } else {
       marker.unmark();
     }
+
+    return () => {
+      marker.unmark();
+    };
   }, [meanings, pendingLookups, marked]);
 
   return (
diff --git a/enjoy/src/renderer/components/stories/story-vocabulary-sheet.tsx b/enjoy/src/renderer/components/stories/story-vocabulary-sheet.tsx
new file mode 100644
index 00000000..f204966e
--- /dev/null
+++ b/enjoy/src/renderer/components/stories/story-vocabulary-sheet.tsx
@@ -0,0 +1,163 @@
+import {
+  Alert,
+  AlertTitle,
+  AlertDescription,
+  Button,
+  ScrollArea,
+  Separator,
+  Sheet,
+  SheetHeader,
+  SheetContent,
+} from "@renderer/components/ui";
+import { MeaningCard, NoRecordsFound, LoaderSpin } from "@renderer/components";
+import { LoaderIcon, LanguagesIcon } from "lucide-react";
+import { t } from "i18next";
+
+/**
+ * Bottom sheet listing a story's key vocabulary.
+ *
+ * Shows a loader until `extracted` is true; afterwards renders resolved
+ * `meanings`, then `pendingLookups` (each with its own look-up button), and
+ * a banner to start or cancel batch lookup while lookups are pending.
+ * All callbacks are optional; missing ones are simply not invoked.
+ */
+export const StoryVocabularySheet = (props: {
+  extracted: boolean;
+  meanings?: MeaningType[];
+  pendingLookups?: Partial<LookupType>[];
+  vocabularyVisible?: boolean;
+  setVocabularyVisible?: (value: boolean) => void;
+  lookingUpInBatch?: boolean;
+  setLookupInBatch?: (value: boolean) => void;
+  processLookup?: (lookup: Partial<LookupType>) => void;
+  lookingUp?: boolean;
+}) => {
+  const {
+    extracted,
+    meanings = [],
+    pendingLookups = [],
+    vocabularyVisible,
+    setVocabularyVisible,
+    lookingUpInBatch,
+    setLookupInBatch,
+    processLookup,
+    lookingUp,
+  } = props;
+
+  return (
+    <Sheet
+      open={!!vocabularyVisible}
+      onOpenChange={(value) => {
+        if (!value) setVocabularyVisible?.(false);
+      }}
+    >
+      <SheetContent side="bottom" className="rounded-t-2xl shadow-lg h-5/6">
+        <SheetHeader className="flex items-center justify-center mb-2">
+          <div className="text-center">
+            <span className="font-semibold text-xl capitalize">
+              {t("keyVocabulary")}
+            </span>
+            <span className="ml-2 text-sm text-muted-foreground">
+              ({meanings.length})
+            </span>
+          </div>
+        </SheetHeader>
+        <div className="w-full max-w-prose mx-auto h-full overflow-hidden px-4">
+          <ScrollArea className="h-full px-4 pb-12">
+            {extracted ? (
+              <>
+                {pendingLookups.length > 0 && (
+                  <Alert className="mb-4">
+                    {lookingUpInBatch ? (
+                      <>
+                        <LoaderIcon className="w-5 h-5 text-muted-foreground animate-spin" />
+                        <AlertTitle>{t("lookingUp")}</AlertTitle>
+                        <AlertDescription className="flex items-start">
+                          <div className="flex-1">
+                            {t("thereAreLookupsPending", {
+                              count: pendingLookups.length,
+                            })}
+                          </div>
+                          <div className="">
+                            <Button
+                              variant="secondary"
+                              onClick={() => setLookupInBatch?.(false)}
+                              size="sm"
+                            >
+                              {t("cancel")}
+                            </Button>
+                          </div>
+                        </AlertDescription>
+                      </>
+                    ) : (
+                      <>
+                        <LanguagesIcon className="w-5 h-5" />
+                        <AlertTitle>{t("pending")}</AlertTitle>
+                        <AlertDescription className="flex items-start">
+                          <div className="flex-1">
+                            {t("thereAreLookupsPending", {
+                              count: pendingLookups.length,
+                            })}
+                          </div>
+                          <div className="">
+                            <Button
+                              variant="outline"
+                              onClick={() => setLookupInBatch?.(true)}
+                              size="sm"
+                            >
+                              {t("lookUpAll")}
+                            </Button>
+                          </div>
+                        </AlertDescription>
+                      </>
+                    )}
+                  </Alert>
+                )}
+
+                {meanings.length > 0 &&
+                  meanings.map((meaning) => (
+                    <div key={meaning.id} className="">
+                      <MeaningCard meaning={meaning} />
+                      <Separator className="my-4" />
+                    </div>
+                  ))}
+
+                {pendingLookups.length > 0 &&
+                  pendingLookups.map((lookup) => (
+                    <div key={lookup.id ?? lookup.word} className="">
+                      <div className="flex items-center justify-between">
+                        <div className="font-bold mb-2">{lookup.word}</div>
+                        <Button
+                          disabled={lookingUp}
+                          onClick={() => processLookup?.(lookup)}
+                          variant="secondary"
+                          size="sm"
+                        >
+                          {t("lookUp")}
+                        </Button>
+                      </div>
+                      <div className="text-sm mb-2">
+                        <div className="uppercase font-semibold my-2">
+                          {t("context")}:
+                        </div>
+                        <div className="mb-2 text-muted-foreground">
+                          {lookup.context}
+                        </div>
+                      </div>
+                      <Separator className="my-4" />
+                    </div>
+                  ))}
+
+                {meanings.length === 0 && pendingLookups.length === 0 && (
+                  <NoRecordsFound />
+                )}
+              </>
+            ) : (
+              <LoaderSpin />
+            )}
+          </ScrollArea>
+        </div>
+      </SheetContent>
+    </Sheet>
+  );
+};
diff --git a/enjoy/src/renderer/pages/story-preview.tsx b/enjoy/src/renderer/pages/story-preview.tsx
index 75d824ca..86d809af 100644
--- a/enjoy/src/renderer/pages/story-preview.tsx
+++ b/enjoy/src/renderer/pages/story-preview.tsx
@@ -53,8 +53,8 @@ export default () => {
 
     webApi
       .createStory({
-        url: story.metadata?.url || story.url,
         ...story,
+        url: story.metadata?.url || story.url,
       } as CreateStoryParamsType)
       .then((story) => {
         navigate(`/stories/${story.id}`);
@@ -168,7 +168,7 @@ export default () => {
 
   useEffect(() => {
     if (readable) {
-      EnjoyApp.view.hide();
+      EnjoyApp.view.hide().catch(console.error);
     } else if (!loading) {
       const rect = containerRef.current.getBoundingClientRect();
       EnjoyApp.view.show({
diff --git a/enjoy/src/renderer/pages/story.tsx b/enjoy/src/renderer/pages/story.tsx
index b32c3011..742a069a 100644
--- a/enjoy/src/renderer/pages/story.tsx
+++ b/enjoy/src/renderer/pages/story.tsx
@@ -5,31 +5,42 @@ import {
   PagePlaceholder,
   StoryToolbar,
   StoryViewer,
+  StoryVocabularySheet,
 } from "@renderer/components";
 import { useState, useContext, useEffect } from "react";
 import { useParams } from "react-router-dom";
-import { AppSettingsProviderContext } from "@renderer/context";
+import {
+  AppSettingsProviderContext,
+  AISettingsProviderContext,
+} from "@renderer/context";
+import { extractStoryCommand, lookupCommand } from "@/commands";
 import nlp from "compromise";
 import paragraphs from "compromise-paragraphs";
 nlp.plugin(paragraphs);
 
-let timeout: NodeJS.Timeout = null;
 export default () => {
   const { id } = useParams<{ id: string }>();
   const { webApi } = useContext(AppSettingsProviderContext);
+  const { openai } = useContext(AISettingsProviderContext);
   const [loading, setLoading] = useState<boolean>(true);
   const [story, setStory] = useState<StoryType>();
   const [meanings, setMeanings] = useState<MeaningType[]>([]);
-  const [pendingLookups, setPendingLookups] = useState<LookupType[]>([]);
-  const [scanning, setScanning] = useState<boolean>(false);
+  const [pendingLookups, setPendingLookups] = useState<Partial<LookupType>[]>(
+    []
+  );
+  const [scanning, setScanning] = useState<boolean>(true);
   const [marked, setMarked] = useState<boolean>(true);
   const [doc, setDoc] = useState<any>(null);
+  const [vocabularyVisible, setVocabularyVisible] = useState<boolean>(false);
+  const [lookingUpInBatch, setLookupInBatch] = useState<boolean>(false);
+  const [lookingUp, setLookingUp] = useState<boolean>(false);
 
   const fetchStory = async () => {
     webApi
       .story(id)
       .then((story) => {
         setStory(story);
+        setVocabularyVisible(!story.extracted);
         const doc = nlp(story.content);
         doc.cache();
         setDoc(doc);
@@ -47,28 +58,75 @@ export default () => {
         if (!response) return;
 
         setMeanings(response.meanings);
-        setPendingLookups(response.pendingLookups);
-
-        if (response.pendingLookups.length > 0) {
-          if (timeout) clearTimeout(timeout);
-
-          timeout = setTimeout(() => {
-            fetchMeanings();
-          }, 3000);
-        }
+        setPendingLookups(response.pendingLookups || []);
       })
       .finally(() => {
         setScanning(false);
       });
   };
 
-  const lookupVocabulary = () => {
-    if (story?.extracted) return;
+  // Extract key words/idioms (via the LLM when the story has none yet) and
+  // persist them; progress and failure are reported through one toast.
+  const extractVocabulary = async () => {
+    if (!story) return;
+
+    let { words = [], idioms = [] } = story?.extraction || {};
+    if (story?.extracted && (words.length > 0 || idioms.length > 0)) return;
+
+    // Only call the LLM when the story carries no extraction result at all.
+    const needsAi = words.length === 0 && idioms.length === 0;
+    // Checked up front: bailing out inside the toast callback would resolve
+    // it and trigger the success toast.
+    if (needsAi && !openai?.key) {
+      toast.error(t("openaiKeyRequired"));
+      return;
+    }
+
+    toast.promise(
+      async () => {
+        try {
+          if (needsAi) {
+            const res = await extractStoryCommand(story.content, {
+              key: openai.key,
+            });
+            words = res.words || [];
+            idioms = res.idioms || [];
+          }
+
+          // Awaited so the toast settles only after the server call does.
+          await webApi.extractVocabularyFromStory(id, {
+            words,
+            idioms,
+          });
+          fetchStory();
+        } catch (error) {
+          console.error(error);
+          // Rethrow so the toast shows its error state, not success.
+          throw error;
+        } finally {
+          setScanning(false);
+        }
+      },
+      {
+        loading: t("extracting"),
+        success: t("extractedSuccessfully"),
+        error: (err) => t("extractionFailed", { error: err.message }),
+        position: "bottom-right",
+      }
+    );
+  };
+
+  const buildVocabulary = () => {
+    if (!story?.extraction) return;
+    if (meanings.length > 0 || pendingLookups.length > 0) return;
     if (!doc) return;
+    if (scanning) return;
 
-    const vocabulary: any[] = [];
+    const { words = [], idioms = [] } = story.extraction || {};
 
-    story.vocabulary.forEach((word) => {
+    const lookups: any[] = [];
+
+    [...words, ...idioms].forEach((word) => {
       const m = doc.lookup(word);
 
       const sentences = m.sentences().json();
@@ -79,7 +137,7 @@ export default () => {
           return;
         }
 
-        vocabulary.push({
+        lookups.push({
           word,
           context,
           sourceId: story.id,
@@ -88,19 +146,24 @@ export default () => {
       });
     });
 
-    webApi.lookupInBatch(vocabulary).then((response) => {
-      const { errors } = response;
-      if (errors.length > 0) {
-        console.warn(errors);
-        return;
-      }
+    const pendings = lookups
+      .filter(
+        (v) =>
+          meanings.findIndex(
+            (m) => m.word.toLowerCase() === v.word.toLowerCase()
+          ) < 0
+      )
+      .filter(
+        (v) =>
+          pendingLookups.findIndex(
+            (l) => l.word.toLowerCase() === v.word.toLowerCase()
+          ) < 0
+      );
 
-      webApi.extractVocabularyFromStory(id).then(() => {
-        fetchStory();
-        if (pendingLookups.length > 0) return;
+    if (pendings.length === 0) return;
 
-        fetchMeanings();
-      });
+    webApi.lookupInBatch(pendings).then(() => {
+      fetchMeanings();
     });
   };
 
@@ -131,18 +194,76 @@ export default () => {
       });
   };
 
+  // Resolve one pending lookup with the LLM and store the chosen meaning.
+  const processLookup = async (pendingLookup: Partial<LookupType>) => {
+    if (lookingUp) return;
+    // Check the key first so a missing key costs no network round trip.
+    if (!openai?.key) {
+      toast.error(t("openaiApiKeyRequired"));
+      return;
+    }
+
+    const { key } = openai;
+    const { word, context } = pendingLookup;
+
+    // Everything fallible runs inside the promise handed to the toast, so a
+    // failing request ends in the error toast, not an unhandled rejection.
+    const resolveLookup = async () => {
+      const { meaningOptions = [] } = await webApi.lookup({
+        word,
+        context,
+        sourceId: story.id,
+        sourceType: "Story",
+      });
+      const res = await lookupCommand(
+        { word, context, meaningOptions },
+        { key }
+      );
+      // An empty context translation means there is nothing to save.
+      if (!res.context_translation?.trim()) return;
+
+      await webApi.updateLookup(pendingLookup.id, {
+        meaning: res,
+        sourceId: story.id,
+        sourceType: "Story",
+      });
+      fetchMeanings();
+    };
+
+    // The flag stays up until the whole chain settles.
+    setLookingUp(true);
+    toast.promise(
+      resolveLookup().finally(() => {
+        setLookingUp(false);
+      }),
+      {
+        loading: t("lookingUp"),
+        success: t("lookedUpSuccessfully"),
+        error: (err) => t("lookupFailed", { error: err.message }),
+        position: "bottom-right",
+      }
+    );
+  };
+
   useEffect(() => {
     fetchStory();
     fetchMeanings();
-
-    return () => {
-      if (timeout) clearTimeout(timeout);
-    };
   }, [id]);
 
   useEffect(() => {
-    lookupVocabulary();
-  }, [story]);
+    extractVocabulary();
+  }, [story?.extracted]);
+
+  useEffect(() => {
+    buildVocabulary();
+  }, [pendingLookups, meanings, story?.extraction]);
+
+  useEffect(() => {
+    if (!lookingUpInBatch) return;
+    if (pendingLookups.length === 0) return;
+
+    processLookup(pendingLookups[0]);
+  }, [pendingLookups, lookingUpInBatch]);
 
   if (loading) {
     return (
@@ -174,19 +295,31 @@ export default () => {
           extracted={story.extracted}
           starred={story.starred}
           toggleStarred={toggleStarred}
-          pendingLookups={pendingLookups}
           handleShare={handleShare}
+          vocabularyVisible={vocabularyVisible}
+          setVocabularyVisible={setVocabularyVisible}
         />
 
         <StoryViewer
           story={story}
           marked={marked}
-          meanings={meanings}
           pendingLookups={pendingLookups}
+          meanings={meanings}
           setMeanings={setMeanings}
           doc={doc}
         />
       </ScrollArea>
+      <StoryVocabularySheet
+        pendingLookups={pendingLookups}
+        extracted={story.extracted}
+        meanings={meanings}
+        vocabularyVisible={vocabularyVisible}
+        setVocabularyVisible={setVocabularyVisible}
+        lookingUpInBatch={lookingUpInBatch}
+        setLookupInBatch={setLookupInBatch}
+        processLookup={processLookup}
+        lookingUp={lookingUp}
+      />
     </>
   );
 };
diff --git a/enjoy/src/types.d.ts b/enjoy/src/types.d.ts
index 68d5219e..8e1241ce 100644
--- a/enjoy/src/types.d.ts
+++ b/enjoy/src/types.d.ts
@@ -10,6 +10,7 @@ type SupportedLlmProviderType = "openai" | "googleGenerativeAi";
 type LlmProviderType = {
   key?: string;
   model?: string;
+  baseUrl?: string; // NOTE(review): presumably a custom API endpoint for the provider — confirm against the settings code
 };
 
 type DownloadStateType = {
@@ -92,6 +93,7 @@ type LookupType = {
   contextTranslation: string;
   status?: "pending" | "completed" | "failed";
   meaning?: MeaningType;
+  meaningOptions?: MeaningType[];
   createdAt: string;
   updatedAt: string;
 };
diff --git a/enjoy/src/types/story.d.ts b/enjoy/src/types/story.d.ts
index f3c63e02..612b4c45 100644
--- a/enjoy/src/types/story.d.ts
+++ b/enjoy/src/types/story.d.ts
@@ -6,6 +6,10 @@ type StoryType = {
   metadata: {
     [key: string]: string;
   };
+  extraction?: {
+    words?: string[];
+    idioms?: string[];
+  };
   vocabulary?: string[];
   extracted?: boolean;
   starred?: boolean;
@@ -21,4 +25,8 @@ type CreateStoryParamsType = {
   metadata: {
     [key: string]: string;
   };
+  extraction?: { // optional; `words` and `idioms` may each be absent
+    words?: string[];
+    idioms?: string[];
+  };
 };
diff --git a/enjoy/tsconfig.json b/enjoy/tsconfig.json
index 37860d29..a10922e4 100644
--- a/enjoy/tsconfig.json
+++ b/enjoy/tsconfig.json
@@ -15,7 +15,8 @@
     "paths": {
       "@/*": ["./src/*"],
       "@renderer/*": ["./src/renderer/*"],
-      "@main/*": ["./src/main/*"]
+      "@main/*": ["./src/main/*"],
+      "@commands": ["./src/commands"]
     },
     "emitDecoratorMetadata": true,
     "experimentalDecorators": true,
diff --git a/enjoy/vite.main.config.mts b/enjoy/vite.main.config.mts
index f6eeff09..cbc19a39 100644
--- a/enjoy/vite.main.config.mts
+++ b/enjoy/vite.main.config.mts
@@ -12,6 +12,7 @@ export default defineConfig({
     alias: {
       "@": path.resolve(__dirname, "./src"),
       "@main": path.resolve(__dirname, "./src/main"),
+      "@commands": path.resolve(__dirname, "./src/commands"),
     },
   },
   build: {
diff --git a/enjoy/vite.renderer.config.mts b/enjoy/vite.renderer.config.mts
index 7651b8c6..8d7d8644 100644
--- a/enjoy/vite.renderer.config.mts
+++ b/enjoy/vite.renderer.config.mts
@@ -20,6 +20,7 @@ export default defineConfig({
     alias: {
       "@": path.resolve(__dirname, "./src"),
       "@renderer": path.resolve(__dirname, "./src/renderer"),
+      "@commands": path.resolve(__dirname, "./src/commands"),
     },
   },
 });
diff --git a/yarn.lock b/yarn.lock
index 18b8df33..9eb55869 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -5803,6 +5803,7 @@ __metadata:
     fs-extra: "npm:^11.2.0"
     html-to-text: "npm:^9.0.5"
     i18next: "npm:^23.7.16"
+    js-md5: "npm:^0.8.3"
     langchain: "npm:^0.1.4"
     lodash: "npm:^4.17.21"
     lucide-react: "npm:^0.312.0"
@@ -7955,6 +7956,13 @@ __metadata:
   languageName: node
   linkType: hard
 
+"js-md5@npm:^0.8.3":
+  version: 0.8.3
+  resolution: "js-md5@npm:0.8.3"
+  checksum: f7e41e95f8e5eb5eeb43085bec3832ae3dfe0020c42fcca5a4efe571213391a9e9594db31bd34624b7280af4f1f12c751b6a50074a15346ecf40a0d54115d77f
+  languageName: node
+  linkType: hard
+
 "js-tiktoken@npm:^1.0.7, js-tiktoken@npm:^1.0.8":
   version: 1.0.8
   resolution: "js-tiktoken@npm:1.0.8"