Update constants from api (#607)

* fix caption ipa display

* fetch gpt/tts providers from API

* fetch remote gpt presets

* update constants

* fix conversation save

* refactor ipa convert

* fetch ipa mapping from api

* fix ipa mark

* fix constant

* validate camdict pron audio src
This commit is contained in:
an-lee
2024-05-14 20:37:51 +08:00
committed by GitHub
parent e5f682c6c5
commit 49dabc89a3
13 changed files with 379 additions and 198 deletions

View File

@@ -80,8 +80,8 @@ export class Client {
return this.api.post("/api/sessions", decamelizeKeys(params));
}
info(): Promise<any> {
return this.api.get("/api/info");
config(key: string): Promise<any> {
return this.api.get(`/api/config/${key}`);
}
deviceCode(provider = "github"): Promise<{

View File

@@ -13,10 +13,10 @@ export const jsonCommand = async (
}
): Promise<any> => {
const { key, temperature = 0, baseUrl, schema } = options;
let { modelName = "gpt-4-turbo" } = options;
let { modelName = "gpt-4o" } = options;
if (RESPONSE_JSON_FORMAT_MODELS.indexOf(modelName) === -1) {
modelName = "gpt-4-turbo";
modelName = "gpt-4o";
}
const chatModel = new ChatOpenAI({

View File

@@ -11,7 +11,7 @@ export const textCommand = async (
}
): Promise<string> => {
const { key, temperature = 0, baseUrl } = options;
let { modelName = "gpt-4-turbo" } = options;
let { modelName = "gpt-4o" } = options;
const chatModel = new ChatOpenAI({
openAIApiKey: key,

View File

@@ -129,6 +129,7 @@ export const RESPONSE_JSON_FORMAT_MODELS = [
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",
"gpt-4o",
"gpt-4-turbo",
"gpt-4-turbo-2024-04-09",
"gpt-4-0125-preview",
@@ -140,10 +141,10 @@ export const CONVERSATION_PRESETS = [
{
key: "english-coach",
name: "英语教练",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `你是我的英语教练。
请将我的话改写成英文。
@@ -160,7 +161,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -169,10 +170,10 @@ export const CONVERSATION_PRESETS = [
{
key: "ny-speak-easy",
name: "NY Speak Easy",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to serves as an English spoken adviser, specializing in translating the user's words into everyday spoken English with a New York twist, focusing on common phrasal verbs and idioms. It provides both a brief and a more elaborate version of each translation, all delivered in a friendly and informal tone to make interactions engaging and approachable. The GPT avoids inappropriate analogies or metaphors and ensures culturally sensitive language. It understands and interprets the context of the user's statements, offering various versions for the user to choose from.`,
temperature: 0.2,
@@ -183,7 +184,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -192,10 +193,10 @@ export const CONVERSATION_PRESETS = [
{
key: "translation-hands",
name: "Translation Hands",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to be an English guru, an expert in authentic American English, who assists users in expressing their thoughts clearly and fluently. You are not just translating words; you are delving into the essence of the user's message and reconstructing it in a way that maintains logical clarity and coherence. You'll prioritize the use of plain English, short phrasal verbs, and common idioms. It's important to craft sentences with varied lengths to create a natural rhythm and flow, making the language sound smooth and engaging. Avoid regional expressions or idioms that are too unique or restricted to specific areas. Your goal is to make American English accessible and appealing to a broad audience, helping users communicate effectively in a style that resonates with a wide range of English speakers.`,
temperature: 0.2,
@@ -206,7 +207,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -215,10 +216,10 @@ export const CONVERSATION_PRESETS = [
{
key: "metaphor-pro",
name: "Metaphor Pro",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your primary role is to act as a 'Metaphor Guru.' It will specialize in analyzing content in various languages, identifying metaphors that might not be easily understood in English culture, and then providing suitable alternatives and explanations in English. This GPT should be adept at language translation and cultural interpretation, ensuring accurate and contextually appropriate metaphor translations. It should be careful to maintain the original sentiment and meaning of the metaphors while adapting them for an English-speaking audience. The GPT should ask for clarification if the provided content is too vague or lacks context. In terms of personalization, it should maintain a helpful and informative demeanor, focusing on delivering clear and concise explanations.`,
temperature: 0.2,
@@ -229,7 +230,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -238,10 +239,10 @@ export const CONVERSATION_PRESETS = [
{
key: "style-guru",
name: "Style Guru",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your primary role is to act as an English language guru, analyzing content provided by the user and offering detailed, formal suggestions to improve it, based on Joseph M. Williams' book, "Style: Toward Clarity and Grace." When users provide text, analyze it thoroughly for style, structure, and clarity, offering specific and detailed advice. Your feedback should be comprehensive and formal, providing in-depth explanations for each suggestion. Maintain a formal and academic tone in your interactions. If the meaning of a user's text is unclear, ask for clarification to ensure the advice provided is as accurate and helpful as possible. Treat each interaction independently, without referencing past interactions or writing styles, focusing solely on the text presented at the moment.`,
temperature: 0.2,
@@ -252,7 +253,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -261,10 +262,10 @@ export const CONVERSATION_PRESETS = [
{
key: "story-scout",
name: "Story Scout",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `You are a Story Searcher GPT, adept at searching through a vast knowledge base to find true stories that suit the user's content needs. Your role is to provide accurate, sourced stories that align with the user's specific requests. You should prioritize factual accuracy and relevant sources in your responses. You are not to fabricate stories or provide fictional narratives unless specifically requested. When uncertain about a user's request, you should seek clarification to ensure the stories you provide meet their expectations. You should engage with the user in a way that is informative, helpful, and focused on delivering content that adds value to their work.`,
temperature: 0.2,
@@ -275,7 +276,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -284,10 +285,10 @@ export const CONVERSATION_PRESETS = [
{
key: "research-aid",
name: "Research Aid",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to act as a research aid, specifically designed to help users find the most interesting and recent scientific papers related to their topics of interest. You should provide DOI links to these papers for easy access. When a user presents a topic, you'll use your research abilities to find relevant, up-to-date scientific literature, focusing on providing accurate and helpful information. It's important to ensure that the information is recent and from credible scientific sources. If clarification is needed on the user's topic, you should ask for more details to refine the search. Your responses should be tailored to each user's inquiry, ensuring they are relevant and specific to the topic provided.`,
temperature: 0.2,
@@ -298,7 +299,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -307,10 +308,10 @@ export const CONVERSATION_PRESETS = [
{
key: "rhyme-master",
name: "Rhyme Master",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to act as an English language guru, specializing in helping users craft rhyming sentences or phrases. You'll analyze the content provided by the user and suggest adjacent sentences or phrases that rhyme, adding a creative twist to their speech. Your goal is to enhance the user's speech or writing with rhythmic and rhyming elements, making it more engaging and stylish. You should prioritize understanding the context and maintaining the original message's integrity while introducing rhymes. If a user's input is unclear or lacks sufficient context for rhyming, you may politely ask for clarification. However, your primary approach should be to confidently create rhymes based on the given information, using your expertise in the English language. You should maintain a friendly and supportive tone, encouraging users in their creative writing endeavors.`,
temperature: 0.2,
@@ -321,7 +322,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -330,10 +331,10 @@ export const CONVERSATION_PRESETS = [
{
key: "quote-finder",
name: "Quote Finder",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to assist users in finding famous quotations from English history, books, or literature that relate to their provided content or input. You should focus on understanding the user's request, identifying relevant themes or keywords, and then sourcing appropriate quotations from a wide range of historical and literary sources. You are expected to provide accurate and contextually relevant quotes, ensuring they align with the user's request. You should avoid providing incorrect or irrelevant quotations, and maintain a respectful and informative tone throughout the interaction. In cases where the request is unclear, you should seek clarification to better understand and fulfill the user's needs. Your responses should be personalized to each user's request, demonstrating an understanding of their specific inquiry and providing tailored quotations that best match their input.`,
temperature: 0.2,
@@ -344,7 +345,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -353,10 +354,10 @@ export const CONVERSATION_PRESETS = [
{
key: "analogy-finder",
name: "Analogy Finder",
engine: "openai",
engine: "enjoyai",
configuration: {
type: "gpt",
model: "gpt-4-turbo",
model: "gpt-4o",
baseUrl: "",
roleDefinition: `Your role is to be a language guru, specializing in providing analogies. When a user provides words, phrases, or passages, you'll search your extensive knowledge base to offer several fitting analogies to enhance their expression. It's important to focus on relevance and creativity in your analogies to ensure they truly enrich the user's language. Avoid providing generic or unrelated analogies. If a passage is unclear or too broad, ask for clarification to ensure the analogies are as fitting as possible.`,
temperature: 0.2,
@@ -367,7 +368,7 @@ export const CONVERSATION_PRESETS = [
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: "openai",
engine: "enjoyai",
model: "tts-1",
voice: "alloy",
},
@@ -375,7 +376,91 @@ export const CONVERSATION_PRESETS = [
},
];
export const IPA_MAPPING: { [key: string]: string } = {
export const IPA_CONSONANTS: { [key: string]: string[] } = {
plosive: [
"p",
"b",
"t",
"d",
"ʈ",
"ɖ",
"c",
"ɟ",
"k",
"g",
"q",
"ɢ",
"ʔ",
/* extensions */ "ɡ",
],
nasal: ["m", "ɱ", "n", "ɳ", "ɲ", "ŋ", "ɴ", "n̩"],
trill: ["ʙ", "r", "ʀ"],
tapOrFlap: ["ⱱ", "ɾ", "ɽ"],
fricative: [
"ɸ",
"β",
"f",
"v",
"θ",
"ð",
"s",
"z",
"ʃ",
"ʒ",
"ʂ",
"ʐ",
"ç",
"ʝ",
"x",
"ɣ",
"χ",
"ʁ",
"ħ",
"ʕ",
"h",
"ɦ",
],
lateralFricative: ["ɬ", "ɮ"],
affricate: ["tʃ", "ʈʃ", "dʒ"], // very incomplete, there are many others
approximant: ["ʋ", "ɹ", "ɻ", "j", "ɰ", /* extensions */ "w"],
lateralApproximant: ["l", "ɭ", "ʎ", "ʟ"],
};
export const IPA_VOWELS: { [key: string]: string[] } = {
close: ["i", "yɨ", "ʉɯ", "u", "iː"],
closeOther: ["ɪ", "ʏ", "ʊ", "ɨ", "ᵻ"],
closeMid: ["e", "ø", "ɘ", "ɵ", "ɤ", "o", "ə", "oː"],
openMid: ["ɛ", "œ", "ɜ", "ɞ", "ʌ", "ɔ", "ɜː", "uː", "ɔː", "ɛː"],
open: ["æ", "a", "ɶ", "ɐ", "ɑ", "ɒ", "ɑː"],
rhotic: ["◌˞", "ɚ", "ɝ", "ɹ̩"],
diphtongs: [
"eɪ",
"əʊ",
"oʊ",
"aɪ",
"ɔɪ",
"aʊ",
"iə",
"ɜr",
"ɑr",
"ɔr",
"oʊr",
"oːɹ",
"ir",
"ɪɹ",
"ɔːɹ",
"ɑːɹ",
"ʊɹ",
"ʊr",
"ɛr",
"ɛɹ",
"əl",
"aɪɚ",
"aɪə",
],
};
export const IPA_MAPPINGS: { [key: string]: string } = {
p: "p",
b: "b",
t: "t",

View File

@@ -36,7 +36,11 @@ import {
} from "@renderer/context";
import { LoaderIcon } from "lucide-react";
import { useNavigate } from "react-router-dom";
import { GPT_PROVIDERS, TTS_PROVIDERS, GPTShareButton } from "@renderer/components";
import {
GPT_PROVIDERS,
TTS_PROVIDERS,
GPTShareButton,
} from "@renderer/components";
const conversationFormSchema = z.object({
name: z.string().optional(),
@@ -73,12 +77,22 @@ export const ConversationForm = (props: {
}) => {
const { conversation, onFinish } = props;
const [submitting, setSubmitting] = useState<boolean>(false);
const [providers, setProviders] = useState<any>(GPT_PROVIDERS);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const [gptProviders, setGptProviders] = useState<any>(GPT_PROVIDERS);
const [ttsProviders, setTtsProviders] = useState<any>(TTS_PROVIDERS);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const { openai } = useContext(AISettingsProviderContext);
const navigate = useNavigate();
const refreshProviders = async () => {
const refreshGptProviders = async () => {
let providers = GPT_PROVIDERS;
try {
const config = await webApi.config("gpt_providers");
providers = Object.assign(providers, config);
} catch (e) {
console.warn(`Failed to fetch remote GPT config: ${e.message}`);
}
try {
const response = await fetch(providers["ollama"]?.baseUrl + "/api/tags");
providers["ollama"].models = (await response.json()).models.map(
@@ -87,7 +101,8 @@ export const ConversationForm = (props: {
} catch (e) {
console.warn(`No ollama server found: ${e.message}`);
}
setProviders({ ...providers });
setGptProviders({ ...providers });
};
const destroyConversation = async () => {
@@ -98,8 +113,22 @@ export const ConversationForm = (props: {
});
};
const refreshTtsProviders = async () => {
let providers = TTS_PROVIDERS;
try {
const config = await webApi.config("tts_providers");
providers = Object.assign(providers, config);
} catch (e) {
console.warn(`Failed to fetch remote TTS config: ${e.message}`);
}
setTtsProviders({ ...providers });
};
useEffect(() => {
refreshProviders();
refreshGptProviders();
refreshTtsProviders();
}, []);
const defaultConfig = JSON.parse(JSON.stringify(conversation || {}));
@@ -116,7 +145,7 @@ export const ConversationForm = (props: {
}
if (defaultConfig.configuration.tts?.engine === "openai" && openai) {
if (!defaultConfig.configuration.tts.baseUrl) {
if (!defaultConfig.configuration.tts?.baseUrl) {
defaultConfig.configuration.tts.baseUrl = openai.baseUrl;
}
}
@@ -165,7 +194,8 @@ export const ConversationForm = (props: {
}
// use default base url if not set
if (!configuration.tts.baseUrl) {
if (!configuration?.tts?.baseUrl) {
configuration.tts ||= {};
configuration.tts.baseUrl = GPT_PROVIDERS[engine]?.baseUrl;
}
@@ -273,21 +303,15 @@ export const ConversationForm = (props: {
</SelectTrigger>
</FormControl>
<SelectContent>
{Object.keys(providers)
.filter((key) =>
GPT_PROVIDERS[key].types.includes(
form.watch("configuration.type")
)
)
.map((key) => (
<SelectItem key={key} value={key}>
{providers[key].name}
</SelectItem>
))}
{Object.keys(gptProviders).map((key) => (
<SelectItem key={key} value={key}>
{gptProviders[key].name}
</SelectItem>
))}
</SelectContent>
</Select>
<FormDescription>
{providers[form.watch("engine")]?.description}
{gptProviders[form.watch("engine")]?.description}
</FormDescription>
<FormMessage />
</FormItem>
@@ -309,13 +333,13 @@ export const ConversationForm = (props: {
</SelectTrigger>
</FormControl>
<SelectContent>
{(providers[form.watch("engine")]?.models || []).map(
(option: string) => (
<SelectItem key={option} value={option}>
{option}
</SelectItem>
)
)}
{(
gptProviders[form.watch("engine")]?.models || []
).map((option: string) => (
<SelectItem key={option} value={option}>
{option}
</SelectItem>
))}
</SelectContent>
</Select>
<FormMessage />
@@ -573,9 +597,9 @@ export const ConversationForm = (props: {
</SelectTrigger>
</FormControl>
<SelectContent>
{Object.keys(TTS_PROVIDERS).map((key) => (
{Object.keys(ttsProviders).map((key) => (
<SelectItem key={key} value={key}>
{TTS_PROVIDERS[key].name}
{ttsProviders[key].name}
</SelectItem>
))}
</SelectContent>
@@ -585,7 +609,7 @@ export const ConversationForm = (props: {
)}
/>
{TTS_PROVIDERS[
{ttsProviders[
form.watch("configuration.tts.engine")
]?.configurable.includes("model") && (
<FormField
@@ -606,7 +630,7 @@ export const ConversationForm = (props: {
</FormControl>
<SelectContent>
{(
TTS_PROVIDERS[form.watch("configuration.tts.engine")]
ttsProviders[form.watch("configuration.tts.engine")]
?.models || []
).map((model: string) => (
<SelectItem key={model} value={model}>
@@ -621,7 +645,7 @@ export const ConversationForm = (props: {
/>
)}
{TTS_PROVIDERS[
{ttsProviders[
form.watch("configuration.tts.engine")
]?.configurable.includes("voice") && (
<FormField
@@ -642,7 +666,7 @@ export const ConversationForm = (props: {
</FormControl>
<SelectContent>
{(
TTS_PROVIDERS[form.watch("configuration.tts.engine")]
ttsProviders[form.watch("configuration.tts.engine")]
?.voices || []
).map((voice: string) => (
<SelectItem key={voice} value={voice}>
@@ -657,7 +681,7 @@ export const ConversationForm = (props: {
/>
)}
{TTS_PROVIDERS[
{ttsProviders[
form.watch("configuration.tts.engine")
]?.configurable.includes("baseUrl") && (
<FormField

View File

@@ -4,21 +4,15 @@ export const GPT_PROVIDERS: { [key: string]: any } = {
enjoyai: {
name: "EnjoyAI",
models: [
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-instruct",
"gpt-4o",
"gpt-4-turbo",
"gpt-4-turbo-2024-04-09",
"gpt-4-0125-preview",
"gpt-4-turbo-preview",
"gpt-4-1106-preview",
"gpt-4-vision-preview",
"gpt-4",
"gpt-4-32k",
"gpt-4-0613",
"gpt-4-32k-0613",
"gpt-3.5-turbo",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-instruct",
],
configurable: [
"model",
@@ -31,27 +25,20 @@ export const GPT_PROVIDERS: { [key: string]: any } = {
"historyBufferSize",
"tts",
],
types: ["gpt", "tts"],
},
openai: {
name: "OpenAI",
description: t("youNeedToSetupApiKeyBeforeUsingOpenAI"),
models: [
"gpt-3.5-turbo-0125",
"gpt-3.5-turbo",
"gpt-3.5-turbo-1106",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-instruct",
"gpt-4o",
"gpt-4-turbo",
"gpt-4-turbo-2024-04-09",
"gpt-4-0125-preview",
"gpt-4-turbo-preview",
"gpt-4-1106-preview",
"gpt-4-vision-preview",
"gpt-4",
"gpt-4-32k",
"gpt-4-0613",
"gpt-4-32k-0613",
"gpt-3.5-turbo",
"gpt-3.5-turbo-16k",
"gpt-3.5-turbo-instruct",
],
configurable: [
"model",
@@ -65,7 +52,6 @@ export const GPT_PROVIDERS: { [key: string]: any } = {
"historyBufferSize",
"tts",
],
types: ["gpt", "tts"],
},
googleGenerativeAi: {
name: "Google Generative AI",
@@ -78,7 +64,6 @@ export const GPT_PROVIDERS: { [key: string]: any } = {
"historyBufferSize",
"tts",
],
types: ["gpt"],
},
ollama: {
name: "Ollama",
@@ -96,6 +81,5 @@ export const GPT_PROVIDERS: { [key: string]: any } = {
"presencePenalty",
"tts",
],
types: ["gpt"],
},
};

View File

@@ -14,4 +14,4 @@ export const TTS_PROVIDERS: { [key: string]: any } = {
voices: ["alloy", "echo", "fable", "onyx", "nova", "shimmer"],
configurable: ["model", "voice", "baseUrl"],
},
};
};

View File

@@ -19,7 +19,7 @@ import {
Timeline,
TimelineEntry,
} from "echogarden/dist/utilities/Timeline.d.js";
import { convertIpaToNormal } from "@/utils";
import { convertWordIpaToNormal } from "@/utils";
import { useCopyToClipboard } from "@uidotdev/usehooks";
import { MediaCaptionTabs } from "./media-captions";
@@ -37,6 +37,7 @@ export const MediaCaption = () => {
editingRegion,
setEditingRegion,
setTranscriptionDraft,
ipaMappings,
} = useContext(MediaPlayerProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const [activeIndex, setActiveIndex] = useState<number>(0);
@@ -411,12 +412,12 @@ export const MediaCaption = () => {
if (displayIpa) {
const text = caption.timeline
.map((word) => {
const ipa = word.timeline
.map((t) =>
t.timeline.map((s) => convertIpaToNormal(s.text)).join("")
)
.join(" · ");
return `${word.text}(${ipa})`;
const ipas = word.timeline.map((t) =>
t.timeline.map((s) => s.text).join("")
);
return `${word.text}(${convertWordIpaToNormal(ipas, {
mappings: ipaMappings,
}).join("")})`;
})
.join(" ");
@@ -475,13 +476,18 @@ const Caption = (props: {
onClick,
} = props;
const { currentNotes } = useContext(MediaPlayerProviderContext);
const { currentNotes, ipaMappings } = useContext(MediaPlayerProviderContext);
const notes = currentNotes.filter((note) => note.parameters?.quoteIndices);
const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
let words = caption.text.split(" ");
const ipas = caption.timeline.map((w) =>
w.timeline.map((t) => t.timeline.map((s) => s.text))
w.timeline.map((t) =>
convertWordIpaToNormal(
t.timeline.map((s) => s.text),
{ mappings: ipaMappings }
).join("")
)
);
if (words.length !== caption.timeline.length) {

View File

@@ -3,7 +3,7 @@ import { MediaPlayerProviderContext } from "@renderer/context";
import { TabsContent, Separator } from "@renderer/components/ui";
import { t } from "i18next";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { convertIpaToNormal } from "@/utils";
import { convertWordIpaToNormal } from "@/utils";
import {
CamdictLookupResult,
AiLookupResult,
@@ -37,7 +37,7 @@ const SelectedWords = (props: {
}) => {
const { selectedIndices, caption } = props;
const { transcription } = useContext(MediaPlayerProviderContext);
const { transcription, ipaMappings } = useContext(MediaPlayerProviderContext);
const word = selectedIndices
.map((index) => caption.timeline[index]?.text || "")
@@ -76,11 +76,14 @@ const SelectedWords = (props: {
>
{word.timeline
.map((t) =>
t.timeline
.map((s) => convertIpaToNormal(s.text))
.join("")
convertWordIpaToNormal(
t.timeline.map((s) => s.text),
{
mappings: ipaMappings,
}
).join("")
)
.join("")}
.join(" ")}
</span>
</div>
)}

View File

@@ -238,28 +238,19 @@ export const CamdictLookupResult = (props: { word: string }) => {
<span className="text-sm font-code">
/{pron.pronunciation}/
</span>
{pron.audio && (
{pron.audio && pron.audio.match(/\.mp3/i) && (
<div>
<Button
variant="ghost"
size="icon"
className="rounded-full p-0 w-6 h-6"
onClick={() => {
const audio = document.getElementById(
`${posItem.type}-${pron.region}`
) as HTMLAudioElement;
if (audio) {
audio.play();
}
const audio = new Audio(pron.audio);
audio.play();
}}
>
<Volume2Icon className="w-4 h-4" />
</Button>
<audio
className="hidden"
id={`${posItem.type}-${pron.region}`}
src={pron.audio}
/>
</div>
)}
</div>

View File

@@ -1,5 +1,5 @@
import { createContext, useEffect, useState, useContext } from "react";
import { extractFrequencies } from "@/utils";
import { convertIpaToNormal, extractFrequencies } from "@/utils";
import { AppSettingsProviderContext } from "@renderer/context";
import {
useTranscriptions,
@@ -13,10 +13,10 @@ import Regions, {
} from "wavesurfer.js/dist/plugins/regions";
import Chart from "chart.js/auto";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { IPA_MAPPING } from "@/constants";
import { toast } from "@renderer/components/ui";
import { Tooltip } from "react-tooltip";
import { debounce } from "lodash";
import { IPA_MAPPINGS } from "@/constants";
type MediaPlayerContextType = {
layout: {
@@ -86,6 +86,8 @@ type MediaPlayerContextType = {
// Segments
currentSegment: SegmentType;
createSegment: () => Promise<SegmentType | void>;
// remote config
ipaMappings: { [key: string]: string };
};
export const MediaPlayerProviderContext =
@@ -118,7 +120,7 @@ export const MediaPlayerProvider = ({
children: React.ReactNode;
}) => {
const minPxPerSec = 150;
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const [layout, setLayout] = useState<{
name: string;
@@ -160,6 +162,10 @@ export const MediaPlayerProvider = ({
const [transcriptionDraft, setTranscriptionDraft] =
useState<TranscriptionType["result"]>();
const [ipaMappings, setIpaMappings] = useState<{ [key: string]: string }>(
IPA_MAPPINGS
);
const {
transcription,
generateTranscription,
@@ -331,7 +337,7 @@ export const MediaPlayerProvider = ({
);
labels[index] = [
labels[index] || "",
(IPA_MAPPING as any)[phone.text.trim()] || phone.text.trim(),
convertIpaToNormal(phone.text.trim()),
].join("");
});
}
@@ -529,6 +535,10 @@ export const MediaPlayerProvider = ({
useEffect(() => {
calculateHeight();
webApi.config("ipa_mappings").then((mappings) => {
if (mappings) setIpaMappings(mappings);
});
EnjoyApp.window.onResize(() => {
deboundeCalculateHeight();
});
@@ -584,6 +594,7 @@ export const MediaPlayerProvider = ({
createNote,
currentSegment: segment,
createSegment,
ipaMappings,
}}
>
{children}

View File

@@ -27,6 +27,11 @@ export default () => {
const [searchParams] = useSearchParams();
const [creating, setCreating] = useState<boolean>(false);
const [preset, setPreset] = useState<any>({});
const [config, setConfig] = useState<any>({
gptPresets: [],
customPreset: {},
ttsPreset: {},
});
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const { currentEngine } = useContext(AISettingsProviderContext);
@@ -46,7 +51,7 @@ export default () => {
}, []);
useEffect(() => {
const postId = searchParams.get('postId');
const postId = searchParams.get("postId");
if (!postId) return;
webApi.post(postId).then((post) => {
@@ -57,8 +62,8 @@ export default () => {
setPreset(preset);
setCreating(true);
})
}, [searchParams.get('postId')])
});
}, [searchParams.get("postId")]);
const fetchConversations = async () => {
const _conversations = await EnjoyApp.conversations.findAll({});
@@ -78,57 +83,74 @@ export default () => {
}
};
const presets = CONVERSATION_PRESETS.map((preset) =>
Object.assign({}, preset, {
engine: currentEngine?.name,
const preparePresets = async () => {
let presets = CONVERSATION_PRESETS;
let defaultGptPreset = {
key: "custom",
engine: "enjoyai",
name: t("custom"),
configuration: {
...preset.configuration,
type: "gpt",
engine: currentEngine?.name || "enjoyai",
tts: {
...preset.configuration.tts,
engine: currentEngine?.name,
engine: currentEngine?.name || "enjoyai",
},
},
})
);
const customPreset = {
key: "custom",
name: t("custom"),
engine: currentEngine?.name,
configuration: {
type: "gpt",
model: "gpt-4-turbo",
baseUrl: "",
roleDefinition: "",
temperature: 0.2,
numberOfChoices: 1,
maxTokens: 2048,
presencePenalty: 0,
frequencyPenalty: 0,
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: currentEngine?.name,
model: "tts-1",
voice: "alloy",
};
let defaultTtsPreset = {
key: "tts",
name: "TTS",
engine: "enjoyai",
configuration: {
type: "tts",
tts: {
engine: currentEngine?.name || "enjoyai",
},
},
},
};
try {
const gptPresets: any[] = await webApi.config("gpt_presets");
const defaultGpt = await webApi.config("default_gpt_preset");
const defaultTts = await webApi.config("default_tts_preset");
presets = gptPresets;
defaultGpt.key = "custom";
defaultGpt.name = t("custom");
defaultGpt.engine = currentEngine?.name || "enjoyai";
defaultGpt.configuration.tts.engine = currentEngine?.name || "enjoyai";
defaultGptPreset = defaultGpt;
defaultTts.engine = currentEngine?.name || "enjoyai";
defaultTts.configuration.tts.engine = currentEngine?.name || "enjoyai";
defaultTtsPreset = defaultTts;
} catch (error) {
console.error(error);
}
const gptPresets = presets.map((preset) =>
Object.assign({}, preset, {
engine: currentEngine?.name,
configuration: {
...preset.configuration,
tts: {
...preset.configuration.tts,
engine: currentEngine?.name,
},
},
})
);
setConfig({
gptPresets,
customPreset: defaultGptPreset,
ttsPreset: defaultTtsPreset,
});
};
const ttsPreset = {
key: "tts",
name: "TTS",
engine: "openai",
configuration: {
type: "tts",
tts: {
baseUrl: "",
engine: currentEngine?.name,
model: "tts-1",
voice: "alloy",
},
},
};
useEffect(() => {
preparePresets();
}, []);
return (
<div className="h-full px-4 py-6 lg:px-8 flex flex-col">
@@ -161,7 +183,7 @@ export default () => {
{t("chooseFromPresetGpts")}
</div>
<ScrollArea className="h-64 pr-4">
{presets.map((preset) => (
{config.gptPresets.map((preset: any) => (
<DialogTrigger
key={preset.key}
data-testid={`conversation-preset-${preset.key}`}
@@ -187,9 +209,9 @@ export default () => {
<div className="grid grid-cols-2 gap-4 mb-6">
<DialogTrigger asChild>
<Button
data-testid={`conversation-preset-${customPreset.key}`}
data-testid={`conversation-preset-${config.customPreset.key}`}
onClick={() => {
setPreset(customPreset);
setPreset(config.customPreset);
setCreating(true);
}}
variant="secondary"
@@ -198,19 +220,21 @@ export default () => {
{t("custom")} GPT
</Button>
</DialogTrigger>
<DialogTrigger asChild>
<Button
data-testid={`conversation-preset-${ttsPreset.key}`}
onClick={() => {
setPreset(ttsPreset);
setCreating(true);
}}
variant="secondary"
className="w-full"
>
TTS
</Button>
</DialogTrigger>
{config.ttsPreset.key && (
<DialogTrigger asChild>
<Button
data-testid={`conversation-preset-${config.ttsPreset.key}`}
onClick={() => {
setPreset(config.ttsPreset);
setCreating(true);
}}
variant="secondary"
className="w-full"
>
TTS
</Button>
</DialogTrigger>
)}
</div>
</DialogContent>
</Dialog>
@@ -234,7 +258,7 @@ export default () => {
style={{
borderLeftColor: `#${conversation.id
.replaceAll("-", "")
.substr(0, 6)}`,
.slice(0, 6)}`,
borderLeftWidth: 3,
}}
>

View File

@@ -1,5 +1,5 @@
import Pitchfinder from "pitchfinder";
import { IPA_MAPPING } from "./constants";
import { IPA_CONSONANTS, IPA_MAPPINGS, IPA_VOWELS } from "./constants";
export const extractFrequencies = (props: {
peaks: Float32Array;
@@ -19,15 +19,18 @@ export const extractFrequencies = (props: {
quantization: bpm,
});
const cleanedFrequencies = removeNoise(frequencies)
const cleanedFrequencies = removeNoise(frequencies);
return cleanedFrequencies;
};
export const removeNoise = (numbers: number[], threshold: number = 0.2): number[] => {
export const removeNoise = (
numbers: number[],
threshold: number = 0.2
): number[] => {
numbers.forEach((num, i) => {
if (i === 0) return;
if (typeof num !== 'number') return;
if (typeof num !== "number") return;
const prevNum = numbers[i - 1] || num;
const nextNum = numbers[i + 1] || num;
@@ -37,7 +40,7 @@ export const removeNoise = (numbers: number[], threshold: number = 0.2): number[
if (deviation > threshold * avgNeighbor) {
numbers[i] = null;
}
})
});
return numbers;
};
@@ -53,12 +56,62 @@ export function milisecondsToTimestamp(ms: number) {
)}:${seconds.padStart(2, "0")},${milliseconds}`;
}
export const convertIpaToNormal = (ipa: string) => {
export const convertWordIpaToNormal = (
ipas: string[],
options?: { mappings?: any }
): string[] => {
const { mappings = IPA_MAPPINGS } = options || {};
const consonants = Object.keys(IPA_CONSONANTS)
.map((key) => IPA_CONSONANTS[key])
.reduce((acc, val) => acc.concat(val), []);
const consonantsRegex = new RegExp(`^(\ˈ|ˌ)?` + consonants.join("|"));
const vowels = Object.keys(IPA_VOWELS)
.map((key) => IPA_VOWELS[key])
.reduce((acc, val) => acc.concat(val), []);
const vowelsRegex = new RegExp(`^(\ˈ|ˌ)?` + vowels.join("|"));
const converted: string[] = [];
// convert each ipa to normal
// if ipa is a vowel and marked, check if the previous ipa is a consonant,
// if so, mark the consonant instead
for (let i = 0; i < ipas.length; i++) {
const ipa = ipas[i];
converted.push(convertIpaToNormal(ipa, { mappings, marked: false }));
const isVowel = vowelsRegex.test(ipa);
const mark = ipa.match(/(\ˈ|ˌ)/);
let j = i - 1;
for (; j >= 0; j--) {
if (consonantsRegex.test(ipas[j]) && !consonantsRegex.test(ipas[j - 1])) {
break;
}
}
if (isVowel && mark) {
if (ipas[j]) {
converted[j] = mark[0] + converted[j];
} else {
converted[i] = mark[0] + converted[i];
}
}
}
return converted;
};
export const convertIpaToNormal = (
ipa: string,
options?: { mappings?: any; marked?: boolean }
): string => {
const { mappings = IPA_MAPPINGS, marked = false } = options || {};
const mark = ipa.match(/(\ˈ|ˌ)/);
const cleanIpa = ipa.replace(mark ? mark[0] : "", "");
const converted = IPA_MAPPING[cleanIpa] || cleanIpa;
if (mark) {
const converted = mappings[cleanIpa] || cleanIpa;
if (mark && marked) {
return `${mark[0]}${converted}`;
} else {
return converted;