Feat: customize settings before transcribing (#699)

* transcribe with language * avoid using .en model to transcribe non-English audio * save language in transcription/audio/video * may select language when regenerating transcription * may select service when regenerating * refactor transcription form * refactor transcription create form * refactor media loading modal * display ipa per language * refactor ipa mappings * parse subtitle files
2024-06-24 14:35:09 +08:00
parent 7f4395354e
commit 3b83861749
31 changed files with 695 additions and 525 deletions
--- a/enjoy/src/api/client.ts
+++ b/enjoy/src/api/client.ts
@@ -260,7 +260,9 @@ export class Client {
    return this.api.post("/api/transcriptions", decamelizeKeys(transcription));
  }

-  syncSegment(segment: Partial<Omit<SegmentType, "audio" | "video">>) {
+  syncSegment(
+    segment: Partial<Omit<SegmentType, "audio" | "video" | "target">>
+  ) {
    return this.api.post("/api/segments", decamelizeKeys(segment));
  }

--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -604,5 +604,8 @@
  "referenceText": "Reference text",
  "inputReferenceTextOrLeaveItBlank": "Input the reference text or leave it blank",
  "assessing": "Assessing",
-  "assessedSuccessfully": "Assessed successfully"
+  "assessedSuccessfully": "Assessed successfully",
+  "optinal": "Optional",
+  "uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)",
+  "onlyTextFileIsSupported": "Only text file is supported"
 }
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -604,5 +604,8 @@
  "referenceText": "参考文本",
  "inputReferenceTextOrLeaveItBlank": "输入参考文本，或者留空",
  "assessing": "正在评估",
-  "assessedSuccessfully": "评估成功"
+  "assessedSuccessfully": "评估成功",
+  "optinal": "可选",
+  "uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)",
+  "onlyTextFileIsSupported": "仅支持文本文件"
 }
--- a/enjoy/src/main/db/handlers/audios-handler.ts
+++ b/enjoy/src/main/db/handlers/audios-handler.ts
@@ -11,10 +11,10 @@ const logger = log.scope("db/handlers/audios-handler");

 class AudiosHandler {
  private async findAll(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    options: FindOptions<Attributes<Audio>>
  ) {
-    return Audio.findAll({
+    const audios = await Audio.findAll({
      order: [["updatedAt", "DESC"]],
      include: [
        {
@@ -25,46 +25,30 @@ class AudiosHandler {
        },
      ],
      ...options,
-    })
-      .then((audios) => {
-        if (!audios) {
-          return [];
-        }
-        return audios.map((audio) => audio.toJSON());
-      })
-      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    });
+
+    if (!audios) {
+      return [];
+    }
+    return audios.map((audio) => audio.toJSON());
  }

  private async findOne(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    where: WhereOptions<Attributes<Audio>>
  ) {
-    return Audio.findOne({
+    const audio = await Audio.findOne({
      where: {
        ...where,
      },
-    })
-      .then((audio) => {
-        if (!audio) return;
+    });
+    if (!audio) return;

-        if (!audio.isSynced) {
-          audio.sync().catch(() => {});
-        }
+    if (!audio.isSynced) {
+      audio.sync().catch(() => {});
+    }

-        return audio.toJSON();
-      })
-      .catch((err) => {
-        logger.error(err);
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    return audio.toJSON();
  }

  private async create(
@@ -79,22 +63,15 @@ class AudiosHandler {
    let file = uri;
    let source;
    if (uri.startsWith("http")) {
-      try {
-        if (youtubedr.validateYtURL(uri)) {
-          file = await youtubedr.autoDownload(uri);
-        } else {
-          file = await downloader.download(uri, {
-            webContents: event.sender,
-          });
-        }
-        if (!file) throw new Error("Failed to download file");
-        source = uri;
-      } catch (err) {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.audio.failedToDownloadFile", { file: uri }),
+      if (youtubedr.validateYtURL(uri)) {
+        file = await youtubedr.autoDownload(uri);
+      } else {
+        file = await downloader.download(uri, {
+          webContents: event.sender,
        });
      }
+      if (!file) throw new Error("Failed to download file");
+      source = uri;
    }

    try {
@@ -119,73 +96,42 @@ class AudiosHandler {

      return audio.toJSON();
    } catch (err) {
-      return event.sender.send("on-notification", {
-        type: "error",
-        message: t("models.audio.failedToAdd", { error: err.message }),
-      });
+      logger.error(err);
+      throw err;
    }
  }

  private async update(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    id: string,
    params: Attributes<Audio>
  ) {
-    const { name, description, metadata } = params;
+    const { name, description, metadata, language } = params;

-    return Audio.findOne({
-      where: { id },
-    })
-      .then((audio) => {
-        if (!audio) {
-          throw new Error(t("models.audio.notFound"));
-        }
-        audio.update({ name, description, metadata });
-      })
-      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    const audio = await Audio.findByPk(id);
+
+    if (!audio) {
+      throw new Error(t("models.audio.notFound"));
+    }
+    return await audio.update({ name, description, metadata, language });
  }

-  private async destroy(event: IpcMainEvent, id: string) {
-    return Audio.findOne({
-      where: { id },
-    }).then((audio) => {
-      if (!audio) {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.audio.notFound"),
-        });
-      }
-      audio.destroy();
-    });
+  private async destroy(_event: IpcMainEvent, id: string) {
+    const audio = await Audio.findByPk(id);
+
+    if (!audio) {
+      throw new Error(t("models.audio.notFound"));
+    }
+    return await audio.destroy();
  }

  private async upload(event: IpcMainEvent, id: string) {
-    const audio = await Audio.findOne({
-      where: { id },
-    });
+    const audio = await Audio.findByPk(id);
    if (!audio) {
-      event.sender.send("on-notification", {
-        type: "error",
-        message: t("models.audio.notFound"),
-      });
+      throw new Error(t("models.audio.notFound"));
    }

-    audio
-      .upload()
-      .then((res) => {
-        return res;
-      })
-      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    return await audio.upload();
  }

  private async crop(
@@ -193,9 +139,7 @@ class AudiosHandler {
    id: string,
    params: { startTime: number; endTime: number }
  ) {
-    const audio = await Audio.findOne({
-      where: { id },
-    });
+    const audio = await Audio.findByPk(id);
    if (!audio) {
      throw new Error(t("models.audio.notFound"));
    }
--- a/enjoy/src/main/db/handlers/transcriptions-handler.ts
+++ b/enjoy/src/main/db/handlers/transcriptions-handler.ts
@@ -5,7 +5,7 @@ import log from "@main/logger";

 const logger = log.scope("db/handlers/transcriptions-handler");
 class TranscriptionsHandler {
-  private async findOrCreate(event: IpcMainEvent, where: Transcription) {
+  private async findOrCreate(_event: IpcMainEvent, where: Transcription) {
    try {
      const { targetType, targetId } = where;
      let target: Video | Audio = null;
@@ -31,10 +31,8 @@ class TranscriptionsHandler {

      return transcription.toJSON();
    } catch (err) {
-      event.sender.send("on-notification", {
-        type: "error",
-        message: err.message,
-      });
+      logger.error(err);
+      throw err;
    }
  }

@@ -43,24 +41,19 @@ class TranscriptionsHandler {
    id: string,
    params: Attributes<Transcription>
  ) {
-    const { result, engine, model, state } = params;
+    const { result, engine, model, state, language } = params;

-    return Transcription.findOne({
-      where: { id },
-    })
-      .then((transcription) => {
-        if (!transcription) {
-          throw new Error("models.transcription.notFound");
-        }
-        transcription.update({ result, engine, model, state });
-      })
-      .catch((err) => {
-        logger.error(err);
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    const transcription = await Transcription.findByPk(id);
+    if (!transcription) {
+      throw new Error("models.transcription.notFound");
+    }
+    return await transcription.update({
+      result,
+      engine,
+      model,
+      state,
+      language,
+    });
  }

  register() {
--- a/enjoy/src/main/db/handlers/videos-handler.ts
+++ b/enjoy/src/main/db/handlers/videos-handler.ts
@@ -11,10 +11,10 @@ const logger = log.scope("db/handlers/videos-handler");

 class VideosHandler {
  private async findAll(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    options: FindOptions<Attributes<Video>>
  ) {
-    return Video.findAll({
+    const videos = await Video.findAll({
      order: [["updatedAt", "DESC"]],
      include: [
        {
@@ -25,46 +25,29 @@ class VideosHandler {
        },
      ],
      ...options,
-    })
-      .then((videos) => {
-        if (!videos) {
-          return [];
-        }
-        return videos.map((video) => video.toJSON());
-      })
-      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    });
+    if (!videos) {
+      return [];
+    }
+    return videos.map((video) => video.toJSON());
  }

  private async findOne(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    where: WhereOptions<Attributes<Video>>
  ) {
-    return Video.findOne({
+    const video = await Video.findOne({
      where: {
        ...where,
      },
-    })
-      .then((video) => {
-        if (!video) return;
+    });
+    if (!video) return;

-        if (!video.isSynced) {
-          video.sync().catch(() => {});
-        }
+    if (!video.isSynced) {
+      video.sync().catch(() => {});
+    }

-        return video.toJSON();
-      })
-      .catch((err) => {
-        logger.error(err);
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    return video.toJSON();
  }

  private async create(
@@ -90,10 +73,8 @@ class VideosHandler {
        if (!file) throw new Error("Failed to download file");
        source = uri;
      } catch (err) {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.video.failedToDownloadFile", { file: uri }),
-        });
+        logger.error(err);
+        throw new Error(t("models.video.failedToDownloadFile", { file: uri }));
      }
    }

@@ -105,72 +86,46 @@ class VideosHandler {
        return video.toJSON();
      })
      .catch((err) => {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.video.failedToAdd", { error: err.message }),
-        });
+        logger.error(err);
+        throw new Error(t("models.video.failedToAdd", { error: err.message }));
      });
  }

  private async update(
-    event: IpcMainEvent,
+    _event: IpcMainEvent,
    id: string,
    params: Attributes<Video>
  ) {
-    const { name, description, metadata } = params;
+    const { name, description, metadata, language } = params;

-    return Video.findOne({
-      where: { id },
-    })
-      .then((video) => {
-        if (!video) {
-          throw new Error(t("models.video.notFound"));
-        }
-        video.update({ name, description, metadata });
-      })
-      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
+    const video = await Video.findByPk(id);
+    if (!video) throw new Error(t("models.video.notFound"));
+    // Await the write: left floating, a failed update became an unhandled
+    // rejection and the IPC call resolved before the row was saved.
+    await video.update({ name, description, metadata, language });
  }

  private async destroy(event: IpcMainEvent, id: string) {
-    return Video.findOne({
-      where: { id },
-    }).then((video) => {
-      if (!video) {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.video.notFound"),
-        });
-      }
-      video.destroy();
-    });
+    const video = await Video.findByPk(id);
+    if (!video) {
+      throw new Error(t("models.video.notFound"));
+    }
+    return await video.destroy();
  }

  private async upload(event: IpcMainEvent, id: string) {
-    const video = await Video.findOne({
-      where: { id },
-    });
+    const video = await Video.findByPk(id);
    if (!video) {
-      event.sender.send("on-notification", {
-        type: "error",
-        message: t("models.video.notFound"),
-      });
+      throw new Error(t("models.video.notFound"));
    }
-
-    video
+    // Await the chain so the rethrow in .catch rejects this IPC call; while
+    // the promise was left floating it became an unhandled rejection and the
+    // caller saw success. The former identity .then() was a no-op.
+    await video
      .upload()
-      .then((res) => {
-        return res;
-      })
      .catch((err) => {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
+        logger.error(err);
+        throw err;
      });
  }

--- a/enjoy/src/main/db/models/segment.ts
+++ b/enjoy/src/main/db/models/segment.ts
@@ -70,6 +70,9 @@ export class Segment extends Model<Segment> {
  @Column(DataType.DATE)
  uploadedAt: Date;

+  @Column(DataType.VIRTUAL)
+  target: Audio | Video;
+
  @BelongsTo(() => Audio, { foreignKey: "targetId", constraints: false })
  audio: Audio;

@@ -208,6 +211,22 @@ export class Segment extends Model<Segment> {
        logger.error("sync error", err);
      });
    });
+
+    if (!Array.isArray(segments)) segments = [segments];
+
+    for (const instance of segments) {
+      if (instance.targetType === "Audio" && instance.audio) {
+        instance.target = instance.audio.toJSON();
+      }
+      if (instance.targetType === "Video" && instance.video) {
+        instance.target = instance.video.toJSON();
+      }
+      // To prevent mistakes:
+      delete instance.audio;
+      delete instance.dataValues.audio;
+      delete instance.video;
+      delete instance.dataValues.video;
+    }
  }

  @AfterCreate
--- a/enjoy/src/main/whisper.ts
+++ b/enjoy/src/main/whisper.ts
@@ -147,6 +147,7 @@ class Whipser {
      };
    },
    options?: {
+      language?: string;
      force?: boolean;
      extra?: string[];
      onProgress?: (progress: number) => void;
@@ -174,9 +175,13 @@ class Whipser {
      throw new Error("No file or blob provided");
    }

-    const model = this.currentModel();
+    const { force = false, extra = [], language, onProgress } = options || {};
+
+    const model = this.currentModel();
+    if (language && !language.startsWith("en") && model.name.includes("en")) {
+      throw new Error(`Model ${model.name} does not support ${language}`);
+    }

-    const { force = false, extra = [], onProgress } = options || {};
    const filename = path.basename(file, path.extname(file));
    const tmpDir = settings.cachePath();
    const outputFile = path.join(tmpDir, filename + ".json");
@@ -197,7 +202,7 @@ class Whipser {
      path.join(tmpDir, filename),
      "--print-progress",
      "--language",
-      model.name.includes("en") ? "en" : "auto",
+      model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
      ...extra,
    ];

@@ -252,7 +257,7 @@ class Whipser {
      return this.config;
    });

-    ipcMain.handle("whisper-set-model", async (event, model) => {
+    ipcMain.handle("whisper-set-model", async (_event, model) => {
      const originalModel = settings.getSync("whisper.model");
      settings.setSync("whisper.model", model);
      this.config = settings.whisperConfig();
@@ -267,35 +272,22 @@ class Whipser {
        })
        .catch((err) => {
          settings.setSync("whisper.model", originalModel);
-          event.sender.send("on-notification", {
-            type: "error",
-            message: err.message,
-          });
+          throw err;
        });
    });

-    ipcMain.handle("whisper-set-service", async (event, service) => {
+    ipcMain.handle("whisper-set-service", async (_event, service) => {
      if (service === "local") {
-        try {
-          await this.check();
-          settings.setSync("whisper.service", service);
-          this.config.service = service;
-          return this.config;
-        } catch (err) {
-          event.sender.send("on-notification", {
-            type: "error",
-            message: err.message,
-          });
-        }
+        await this.check();
+        settings.setSync("whisper.service", service);
+        this.config.service = service;
+        return this.config;
      } else if (["cloudflare", "azure", "openai"].includes(service)) {
        settings.setSync("whisper.service", service);
        this.config.service = service;
        return this.config;
      } else {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: "Unknown service",
-        });
+        throw new Error("Unknown service");
      }
    });

@@ -304,19 +296,12 @@ class Whipser {
    });

    ipcMain.handle("whisper-transcribe", async (event, params, options) => {
-      try {
-        return await this.transcribe(params, {
-          ...options,
-          onProgress: (progress) => {
-            event.sender.send("whisper-on-progress", progress);
-          },
-        });
-      } catch (err) {
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      }
+      return await this.transcribe(params, {
+        ...options,
+        onProgress: (progress) => {
+          event.sender.send("whisper-on-progress", progress);
+        },
+      });
    });

    ipcMain.handle("whisper-abort", async (_event) => {
--- a/enjoy/src/preload.ts
+++ b/enjoy/src/preload.ts
@@ -454,6 +454,7 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
        };
      },
      options?: {
+        language?: string;
        force?: boolean;
        extra?: string[];
      }
--- a/enjoy/src/renderer/components/audios/audio-card.tsx
+++ b/enjoy/src/renderer/components/audios/audio-card.tsx
@@ -1,6 +1,7 @@
 import { Link } from "react-router-dom";
 import { cn } from "@renderer/lib/utils";
 import { AudioLinesIcon } from "lucide-react";
+import { Badge } from "@renderer/components/ui";

 export const AudioCard = (props: {
  audio: Partial<AudioType>;
@@ -12,9 +13,9 @@ export const AudioCard = (props: {
    <div className={cn("w-full", className)}>
      <Link to={`/audios/${audio.id}`}>
        <div
-          className="aspect-square border rounded-lg overflow-hidden flex"
+          className="aspect-square border rounded-lg overflow-hidden flex relative"
          style={{
-            borderBottomColor: `#${audio.md5.substr(0, 6)}`,
+            borderBottomColor: `#${audio.md5.slice(0, 6)}`,
            borderBottomWidth: 3,
          }}
        >
@@ -25,7 +26,11 @@ export const AudioCard = (props: {
              className="hover:scale-105 object-cover w-full h-full"
            />
          ) : (
-              <AudioLinesIcon className="hover:scale-105 object-cover w-1/2 h-1/2 m-auto" />
+            <AudioLinesIcon className="hover:scale-105 object-cover w-1/2 h-1/2 m-auto" />
+          )}
+
+          {audio.language && (
+            <Badge className="absolute left-2 top-2">{audio.language}</Badge>
          )}
        </div>
      </Link>
--- a/enjoy/src/renderer/components/index.ts
+++ b/enjoy/src/renderer/components/index.ts
@@ -10,6 +10,7 @@ export * from "./preferences";
 export * from "./pronunciation-assessments";
 export * from "./recordings";
 export * from "./stories";
+export * from "./transcriptions";
 export * from "./users";
 export * from "./videos";
 export * from "./widgets";
--- a/enjoy/src/renderer/components/medias/index.ts
+++ b/enjoy/src/renderer/components/medias/index.ts
@@ -6,7 +6,6 @@ export * from "./media-current-recording";
 export * from "./media-recorder";
 export * from "./media-transcription";
 export * from "./media-transcription-read-button";
-export * from "./media-transcription-form";
 export * from "./media-transcription-generate-button";
 export * from "./media-player";
 export * from "./media-provider";
--- a/enjoy/src/renderer/components/medias/media-caption.tsx
+++ b/enjoy/src/renderer/components/medias/media-caption.tsx
@@ -37,9 +37,10 @@ export const MediaCaption = () => {
    editingRegion,
    setEditingRegion,
    setTranscriptionDraft,
-    ipaMappings,
  } = useContext(MediaPlayerProviderContext);
-  const { EnjoyApp, learningLanguage } = useContext(AppSettingsProviderContext);
+  const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );
  const [activeIndex, setActiveIndex] = useState<number>(0);
  const [selectedIndices, setSelectedIndices] = useState<number[]>([]);
  const [multiSelecting, setMultiSelecting] = useState<boolean>(false);
@@ -366,6 +367,7 @@ export const MediaCaption = () => {
        >
          <Caption
            caption={caption}
+            language={transcription.language}
            selectedIndices={selectedIndices}
            currentSegmentIndex={currentSegmentIndex}
            activeIndex={activeIndex}
@@ -428,7 +430,9 @@ export const MediaCaption = () => {
                    t.timeline.map((s) => s.text).join("")
                  );
                  return `${word.text}(${
-                    learningLanguage.startsWith("en")
+                    (transcription.language || learningLanguage).startsWith(
+                      "en"
+                    )
                      ? convertWordIpaToNormal(ipas, {
                          mappings: ipaMappings,
                        }).join("")
@@ -475,6 +479,7 @@ export const MediaCaption = () => {

 export const Caption = (props: {
  caption: TimelineEntry;
+  language?: string;
  selectedIndices?: number[];
  currentSegmentIndex: number;
  activeIndex?: number;
@@ -482,6 +487,11 @@ export const Caption = (props: {
  displayNotes?: boolean;
  onClick?: (index: number) => void;
 }) => {
+  const { currentNotes } = useContext(MediaPlayerProviderContext);
+  const { learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );
+  const notes = currentNotes.filter((note) => note.parameters?.quoteIndices);
  const {
    caption,
    selectedIndices = [],
@@ -491,16 +501,14 @@ export const Caption = (props: {
    displayNotes,
    onClick,
  } = props;
+  const language = props.language || learningLanguage;

-  const { currentNotes, ipaMappings } = useContext(MediaPlayerProviderContext);
-  const { learningLanguage } = useContext(AppSettingsProviderContext);
-  const notes = currentNotes.filter((note) => note.parameters?.quoteIndices);
  const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);

  let words = caption.text.split(" ");
  const ipas = caption.timeline.map((w) =>
    w.timeline.map((t) =>
-      learningLanguage.startsWith("en")
+      language.startsWith("en")
        ? convertWordIpaToNormal(
            t.timeline.map((s) => s.text),
            { mappings: ipaMappings }
--- a/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
+++ b/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
@@ -40,8 +40,8 @@ const SelectedWords = (props: {
 }) => {
  const { selectedIndices, caption } = props;

-  const { transcription, ipaMappings } = useContext(MediaPlayerProviderContext);
-  const { learningLanguage } = useContext(AppSettingsProviderContext);
+  const { transcription } = useContext(MediaPlayerProviderContext);
+  const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);

  const word = selectedIndices
    .map((index) => caption.timeline[index]?.text || "")
--- a/enjoy/src/renderer/components/medias/media-loading-modal.tsx
+++ b/enjoy/src/renderer/components/medias/media-loading-modal.tsx
@@ -18,6 +18,7 @@ import {
 import { CheckCircleIcon, LoaderIcon, XCircleIcon } from "lucide-react";
 import { t } from "i18next";
 import { useNavigate } from "react-router-dom";
+import { TranscriptionCreateForm } from "../transcriptions";

 export const MediaLoadingModal = () => {
  const navigate = useNavigate();
@@ -35,7 +36,7 @@ export const MediaLoadingModal = () => {
  return (
    <AlertDialog open={!decoded || !Boolean(transcription?.result?.timeline)}>
      <AlertDialogOverlay className="" />
-      <AlertDialogContent className="z-[100]">
+      <AlertDialogContent className="">
        <AlertDialogHeader>
          <AlertDialogTitle>{t("preparingAudio")}</AlertDialogTitle>
          <AlertDialogDescription>
@@ -43,81 +44,54 @@ export const MediaLoadingModal = () => {
          </AlertDialogDescription>
        </AlertDialogHeader>

-        <div className="py-4">
-          {decoded ? (
-            <div className="mb-4 flex items-center space-x-4">
-              <CheckCircleIcon className="w-4 h-4 text-green-500" />
-              <span>{t("waveformIsDecoded")}</span>
-            </div>
-          ) : decodeError ? (
-            <div className="mb-4 flex items-center space-x-4">
-              <div className="w-4 h-4">
-                <XCircleIcon className="w-4 h-4 text-destructive" />
-              </div>
-              <div className="select-text">
-                <div className="mb-2">{decodeError}</div>
-                <div className="text-sm text-muted-foreground">
-                  {t("failedToDecodeWaveform")}:{" "}
-                  <span className="break-all ">{media?.src}</span>
-                </div>
-              </div>
-            </div>
-          ) : (
-            <div className="mb-4 flex items-center space-x-4">
-              <LoaderIcon className="w-4 h-4 animate-spin" />
-              <span>{t("decodingWaveform")}</span>
-            </div>
-          )}
-
-          {!transcription ? (
-            <div className="flex items-center space-x-4">
-              <LoaderIcon className="w-4 h-4 animate-spin" />
-              <span>{t("loadingTranscription")}</span>
-            </div>
-          ) : transcription.result?.timeline ? (
+        {decoded ? (
+          transcription?.result?.timeline ? (
            <div className="flex items-center space-x-4">
              <CheckCircleIcon className="w-4 h-4 text-green-500" />
              <span>{t("transcribedSuccessfully")}</span>
            </div>
-          ) : transcribing ? (
-            <div className="">
-              <div className="flex items-center space-x-4 mb-2">
-                <PingPoint colorClassName="bg-yellow-500" />
-                <span>{t("transcribing")}</span>
-              </div>
-              {whisperConfig.service === "local" && (
-                <Progress value={transcribingProgress} />
-              )}
-            </div>
          ) : (
-            <div className="flex items-center space-x-4">
-              <PingPoint colorClassName="bg-muted" />
-              <div className="inline">
-                <span>{t("notTranscribedYet")}</span>
-                {decoded && (
-                  <Button asChild className="ml-4" size="sm">
-                    <a
-                      className="cursor-pointer"
-                      onClick={() =>
-                        generateTranscription({
-                          originalText: "",
-                        })
-                      }
-                    >
-                      {t("regenerate")}
-                    </a>
-                  </Button>
-                )}
+            <TranscriptionCreateForm
+              onSubmit={(data) => {
+                generateTranscription({
+                  originalText: data.text,
+                  language: data.language,
+                  service: data.service as WhisperConfigType["service"],
+                });
+              }}
+              onCancel={() => navigate(-1)}
+              transcribing={transcribing}
+              transcribingProgress={transcribingProgress}
+            />
+          )
+        ) : (
+          <>
+            {decodeError ? (
+              <div className="mb-4 flex items-center space-x-4">
+                <div className="w-4 h-4">
+                  <XCircleIcon className="w-4 h-4 text-destructive" />
+                </div>
+                <div className="select-text">
+                  <div className="mb-2">{decodeError}</div>
+                  <div className="text-sm text-muted-foreground">
+                    {t("failedToDecodeWaveform")}:{" "}
+                    <span className="break-all ">{media?.src}</span>
+                  </div>
+                </div>
              </div>
-            </div>
-          )}
-        </div>
-
-        <AlertDialogFooter>
-          <Button variant="secondary" onClick={() => navigate(-1)}>
-            {t("cancel")}
-          </Button>
-        </AlertDialogFooter>
+            ) : (
+              <div className="mb-4 flex items-center space-x-4">
+                <LoaderIcon className="w-4 h-4 animate-spin" />
+                <span>{t("decodingWaveform")}</span>
+              </div>
+            )}
+            <AlertDialogFooter>
+              <Button variant="secondary" onClick={() => navigate(-1)}>
+                {t("cancel")}
+              </Button>
+            </AlertDialogFooter>
+          </>
+        )}
      </AlertDialogContent>
    </AlertDialog>
  );
--- a/enjoy/src/renderer/components/medias/media-transcription-form.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-form.tsx
@@ -1,127 +0,0 @@
-import { MediaPlayerProviderContext } from "@renderer/context";
-import {
-  AlertDialog,
-  AlertDialogAction,
-  AlertDialogCancel,
-  AlertDialogContent,
-  AlertDialogDescription,
-  AlertDialogFooter,
-  AlertDialogHeader,
-  AlertDialogTitle,
-  AlertDialogTrigger,
-  Button,
-  Dialog,
-  DialogClose,
-  DialogContent,
-  DialogFooter,
-  DialogHeader,
-  DialogTitle,
-  DialogTrigger,
-  Textarea,
-  toast,
-} from "@renderer/components/ui";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
-import { t } from "i18next";
-import { useContext, useState } from "react";
-import { LoaderIcon } from "lucide-react";
-
-export const MediaTranscriptionForm = (props: {
-  children?: React.ReactNode;
-}) => {
-  const [open, setOpen] = useState(false);
-
-  return (
-    <Dialog open={open} onOpenChange={setOpen}>
-      <DialogTrigger asChild>
-        {props.children ? (
-          props.children
-        ) : (
-          <Button variant="outline" size="sm">
-            <span className="capitalize">{t("edit")}</span>
-          </Button>
-        )}
-      </DialogTrigger>
-      <DialogContent className="max-w-screen-sm xl:max-w-screen-md">
-        <TranscriptionForm setOpen={setOpen} />
-      </DialogContent>
-    </Dialog>
-  );
-};
-
-export const TranscriptionForm = (props: {
-  setOpen: (value: boolean) => void;
-}) => {
-  const { setOpen } = props;
-  const [submiting, setSubmiting] = useState(false);
-  const { transcription, generateTranscription } = useContext(
-    MediaPlayerProviderContext
-  );
-  const [content, setContent] = useState<string>(
-    transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
-  );
-
-  const handleSave = async () => {
-    setSubmiting(true);
-    try {
-      await generateTranscription({ originalText: content });
-      setOpen(false);
-    } catch (e) {
-      toast.error(e.message);
-    }
-
-    setSubmiting(false);
-  };
-
-  return (
-    <>
-      <DialogHeader>
-        <DialogTitle>{t("editTranscription")}</DialogTitle>
-      </DialogHeader>
-      <div>
-        <Textarea
-          disabled={submiting}
-          className="h-96 text-lg font-serif resize-none"
-          value={content}
-          onChange={(e) => setContent(e.target.value)}
-        />
-      </div>
-      <DialogFooter>
-        <DialogClose asChild>
-          <Button disabled={submiting} variant="secondary">
-            {t("cancel")}
-          </Button>
-        </DialogClose>
-
-        <AlertDialog>
-          <AlertDialogTrigger asChild>
-            <Button disabled={submiting}>
-              {submiting && <LoaderIcon className="animate-spin w-4 mr-2" />}
-              {t("save")}
-            </Button>
-          </AlertDialogTrigger>
-          <AlertDialogContent>
-            <AlertDialogHeader>
-              <AlertDialogTitle>{t("saveTranscription")}</AlertDialogTitle>
-              <AlertDialogDescription>
-                {t("areYouSureToSaveTranscription")}
-              </AlertDialogDescription>
-            </AlertDialogHeader>
-            <AlertDialogFooter>
-              <AlertDialogCancel disabled={submiting}>
-                {t("cancel")}
-              </AlertDialogCancel>
-              <AlertDialogAction asChild>
-                <Button disabled={submiting} onClick={handleSave}>
-                  {submiting && (
-                    <LoaderIcon className="animate-spin w-4 mr-2" />
-                  )}
-                  {t("save")}
-                </Button>
-              </AlertDialogAction>
-            </AlertDialogFooter>
-          </AlertDialogContent>
-        </AlertDialog>
-      </DialogFooter>
-    </>
-  );
-};
--- a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
@@ -1,28 +1,32 @@
-import { useContext, useRef, useState } from "react";
+import { useContext, useState } from "react";
 import { MediaPlayerProviderContext } from "@renderer/context";
 import { t } from "i18next";
 import {
  Button,
  AlertDialog,
  AlertDialogTrigger,
-  AlertDialogFooter,
  AlertDialogHeader,
  AlertDialogContent,
  AlertDialogTitle,
  AlertDialogDescription,
-  AlertDialogCancel,
-  AlertDialogAction,
 } from "@renderer/components/ui";
 import { LoaderIcon } from "lucide-react";
+import { TranscriptionCreateForm } from "../transcriptions";

 export const MediaTranscriptionGenerateButton = (props: {
  children: React.ReactNode;
 }) => {
-  const { media, generateTranscription, transcribing, transcription } =
-    useContext(MediaPlayerProviderContext);
+  const {
+    media,
+    generateTranscription,
+    transcribing,
+    transcription,
+    transcribingProgress,
+  } = useContext(MediaPlayerProviderContext);
+  const [open, setOpen] = useState(false);

  return (
-    <AlertDialog>
+    <AlertDialog open={open} onOpenChange={setOpen}>
      <AlertDialogTrigger disabled={transcribing} asChild>
        {props.children ? (
          props.children
@@ -50,18 +54,20 @@ export const MediaTranscriptionGenerateButton = (props: {
            })}
          </AlertDialogDescription>
        </AlertDialogHeader>
-        <AlertDialogFooter>
-          <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
-          <AlertDialogAction
-            onClick={() =>
-              generateTranscription({
-                originalText: "",
-              })
-            }
-          >
-            {t("transcribe")}
-          </AlertDialogAction>
-        </AlertDialogFooter>
+
+        <TranscriptionCreateForm
+          onCancel={() => setOpen(false)}
+          onSubmit={(data) => {
+            generateTranscription({
+              originalText: data.text,
+              language: data.language,
+              service: data.service as WhisperConfigType["service"],
+            });
+            setOpen(false);
+          }}
+          transcribing={transcribing}
+          transcribingProgress={transcribingProgress}
+        />
      </AlertDialogContent>
    </AlertDialog>
  );
--- a/enjoy/src/renderer/components/medias/media-transcription.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -24,9 +24,9 @@ import {
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { formatDuration } from "@renderer/lib/utils";
 import {
-  MediaTranscriptionForm,
  MediaTranscriptionReadButton,
  MediaTranscriptionGenerateButton,
+  TranscriptionEditButton,
 } from "@renderer/components";

 export const MediaTranscription = (props: { display?: boolean }) => {
@@ -157,11 +157,11 @@ export const MediaTranscription = (props: { display?: boolean }) => {
                  </MediaTranscriptionGenerateButton>
                </DropdownMenuItem>
                <DropdownMenuItem asChild>
-                  <MediaTranscriptionForm>
+                  <TranscriptionEditButton>
                    <Button variant="ghost" className="block w-full">
                      {t("edit")}
                    </Button>
-                  </MediaTranscriptionForm>
+                  </TranscriptionEditButton>
                </DropdownMenuItem>
              </DropdownMenuContent>
            </DropdownMenu>
--- a/enjoy/src/renderer/components/notes/note-segment.tsx
+++ b/enjoy/src/renderer/components/notes/note-segment.tsx
@@ -1,6 +1,8 @@
 import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
-import { useState } from "react";
+import { useContext, useState } from "react";
 import { WavesurferPlayer } from "@/renderer/components/misc";
+import { AppSettingsProviderContext } from "@/renderer/context";
+import { convertWordIpaToNormal } from "@/utils";

 export const NoteSemgent = (props: {
  segment: SegmentType;
@@ -8,12 +10,23 @@ export const NoteSemgent = (props: {
 }) => {
  const { segment, notes } = props;
  const caption: TimelineEntry = segment.caption;
+  const { learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );

  const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);

  let words = caption.text.split(" ");
+  const language = segment.target?.language || learningLanguage;
  const ipas = caption.timeline.map((w) =>
-    w.timeline.map((t) => t.timeline.map((s) => s.text))
+    w.timeline.map((t) =>
+      language.startsWith("en")
+        ? convertWordIpaToNormal(
+            t.timeline.map((s) => s.text),
+            { mappings: ipaMappings }
+          ).join("")
+        : t.text
+    )
  );

  if (words.length !== caption.timeline.length) {
--- a/enjoy/src/renderer/components/transcriptions/index.ts
+++ b/enjoy/src/renderer/components/transcriptions/index.ts
@@ -0,0 +1,2 @@
+export * from "./transcription-create-form";
+export * from "./transcription-edit-button";
--- a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
+++ b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
@@ -0,0 +1,233 @@
+import {
+  AISettingsProviderContext,
+  AppSettingsProviderContext,
+} from "@renderer/context";
+import { zodResolver } from "@hookform/resolvers/zod";
+import { useContext } from "react";
+import { useForm } from "react-hook-form";
+import { z } from "zod";
+import {
+  Button,
+  Form,
+  FormField,
+  FormItem,
+  FormLabel,
+  FormMessage,
+  Input,
+  PingPoint,
+  Progress,
+  Select,
+  SelectContent,
+  SelectItem,
+  SelectTrigger,
+  SelectValue,
+  Textarea,
+  toast,
+} from "@renderer/components/ui";
+import { t } from "i18next";
+import { LANGUAGES } from "@/constants";
+import { LoaderIcon } from "lucide-react";
+import { parseText } from "media-captions";
+
+const transcriptionSchema = z.object({
+  language: z.string(),
+  service: z.string(),
+  text: z.string().optional(),
+});
+
+export const TranscriptionCreateForm = (props: {
+  onSubmit: (data: z.infer<typeof transcriptionSchema>) => void;
+  onCancel?: () => void;
+  transcribing?: boolean;
+  transcribingProgress?: number;
+}) => {
+  const {
+    transcribing = false,
+    transcribingProgress = 0,
+    onSubmit,
+    onCancel,
+  } = props;
+  const { learningLanguage } = useContext(AppSettingsProviderContext);
+  const { whisperConfig } = useContext(AISettingsProviderContext);
+
+  const form = useForm<z.infer<typeof transcriptionSchema>>({
+    resolver: zodResolver(transcriptionSchema),
+    values: {
+      language: learningLanguage,
+      service: whisperConfig.service,
+      text: "",
+    },
+  });
+
+  // Resolves with the plain text of a .txt/.srt/.vtt file; rejects if unreadable or empty.
+  const parseSubtitle = (file: File) => {
+    const fileType = file.name.split(".").pop();
+    return new Promise<string>((resolve, reject) => {
+      const reader = new FileReader();
+      reader.onload = async (e) => {
+        // Reject on any failure so the returned promise always settles.
+        try {
+          let text = e.target.result;
+          if (typeof text !== "string") {
+            return reject(new Error("Failed to read file"));
+          }
+          const caption = await parseText(text, {
+            strict: false,
+            type: fileType as "srt" | "vtt",
+          });
+          // When no cues were parsed, fall back to the regex-based cleanup.
+          if (caption.cues.length === 0) {
+            text = cleanSubtitleText(text);
+          } else {
+            text = caption.cues.map((cue) => cue.text).join("\n");
+          }
+          // Remove all content inside `()`
+          text = text.replace(/\(.*?\)/g, "").trim();
+          if (text.length === 0) {
+            return reject(new Error("No text found in the file"));
+          }
+          resolve(text);
+        } catch (err) {
+          reject(err);
+        }
+      };
+      reader.onerror = () => reject(reader.error ?? new Error("Failed to read file"));
+      reader.readAsText(file);
+    });
+  };
+
+  const cleanSubtitleText = (text: string) => {
+    // Regex cleanup used when the caption parser returns no cues. Removes:
+    // - timestamps like `00:00:00,000` or `00:00:00.000 --> 00:00:00.000`
+    // - whole lines starting with `#` (LF/CRLF terminated, or the last line)
+    // - lines that are empty or contain only whitespace
+    return text
+      .replace(
+        /(\d{2}:\d{2}:\d{2}[,\.]\d{3}(\s+-->\s+\d{2}:\d{2}:\d{2}[,\.]\d{3})?)\s+/g,
+        ""
+      )
+      .replace(/^#.*(?:\r?\n|$)/gm, "")
+      .replace(/^\s*[\r\n]/gm, "")
+      .replace(/^\s+$/gm, "");
+  };
+
+  return (
+    <Form {...form}>
+      <form
+        onSubmit={form.handleSubmit(onSubmit)}
+        className="gap-4 grid w-full"
+      >
+        <FormField
+          control={form.control}
+          name="service"
+          render={({ field }) => (
+            <FormItem className="grid w-full items-center gap-1.5">
+              <FormLabel>{t("sttAiService")}</FormLabel>
+              <Select
+                disabled={transcribing}
+                value={field.value}
+                onValueChange={field.onChange}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  <SelectItem value="local">{t("local")}</SelectItem>
+                  <SelectItem value="azure">{t("azureAi")}</SelectItem>
+                  <SelectItem value="cloudflare">
+                    {t("cloudflareAi")}
+                  </SelectItem>
+                  <SelectItem value="openai">OpenAI</SelectItem>
+                </SelectContent>
+              </Select>
+            </FormItem>
+          )}
+        />
+        <FormField
+          control={form.control}
+          name="language"
+          render={({ field }) => (
+            <FormItem className="grid w-full items-center gap-1.5">
+              <FormLabel>{t("language")}</FormLabel>
+              <Select
+                disabled={transcribing}
+                value={field.value}
+                onValueChange={field.onChange}
+              >
+                <SelectTrigger>
+                  <SelectValue />
+                </SelectTrigger>
+                <SelectContent>
+                  {LANGUAGES.map((language) => (
+                    <SelectItem key={language.code} value={language.code}>
+                      {language.name}
+                    </SelectItem>
+                  ))}
+                </SelectContent>
+              </Select>
+              <FormMessage />
+            </FormItem>
+          )}
+        />
+        <FormField
+          control={form.control}
+          name="text"
+          render={({ field }) => (
+            <FormItem className="grid w-full items-center gap-1.5">
+              <FormLabel>
+                {t("uploadTranscriptFile")}({t("optinal")})
+              </FormLabel>
+              <Input
+                disabled={transcribing}
+                type="file"
+                accept=".txt,.srt,.vtt"
+                onChange={async (event) => {
+                  const file = event.target.files[0];
+
+                  if (file) {
+                    parseSubtitle(file)
+                      .then((text) => {
+                        field.onChange(text);
+                      })
+                      .catch((error) => {
+                        toast.error(error.message);
+                      });
+                  } else {
+                    field.onChange("");
+                  }
+                }}
+              />
+              {field.value && (
+                <Textarea className="h-96" {...field} disabled={transcribing} />
+              )}
+              <FormMessage />
+            </FormItem>
+          )}
+        />
+        {transcribing && (
+          <div className="mb-4">
+            <div className="flex items-center space-x-4 mb-2">
+              <PingPoint colorClassName="bg-yellow-500" />
+              <span>{t("transcribing")}</span>
+            </div>
+            {whisperConfig.service === "local" && (
+              <Progress value={transcribingProgress} />
+            )}
+          </div>
+        )}
+
+        <div className="flex justify-end space-x-4">
+          {onCancel && (
+            <Button type="reset" variant="outline" onClick={onCancel}>
+              {t("cancel")}
+            </Button>
+          )}
+          <Button disabled={transcribing} type="submit" variant="default">
+            {transcribing && <LoaderIcon className="animate-spin w-4 mr-2" />}
+            {t("transcribe")}
+          </Button>
+        </div>
+      </form>
+    </Form>
+  );
+};
--- a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx
+++ b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx
@@ -0,0 +1,115 @@
+import { MediaPlayerProviderContext } from "@renderer/context";
+import {
+  AlertDialog,
+  AlertDialogAction,
+  AlertDialogCancel,
+  AlertDialogContent,
+  AlertDialogDescription,
+  AlertDialogFooter,
+  AlertDialogHeader,
+  AlertDialogTitle,
+  AlertDialogTrigger,
+  Button,
+  Dialog,
+  DialogClose,
+  DialogContent,
+  DialogFooter,
+  DialogHeader,
+  DialogTitle,
+  DialogTrigger,
+  Textarea,
+  toast,
+} from "@renderer/components/ui";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { t } from "i18next";
+import { useContext, useState } from "react";
+import { LoaderIcon } from "lucide-react";
+
+export const TranscriptionEditButton = (props: {
+  children?: React.ReactNode;
+}) => {
+  const [open, setOpen] = useState(false);
+  const [submiting, setSubmiting] = useState(false);
+  const { transcription, generateTranscription } = useContext(
+    MediaPlayerProviderContext
+  );
+  const [content, setContent] = useState<string>(
+    transcription?.result?.timeline?.map((t: TimelineEntry) => t.text).join("\n\n") ?? ""
+  );
+  // Regenerates from the edited text, passing the stored language (if any) explicitly.
+  const handleSave = async () => {
+    setSubmiting(true);
+    try {
+      const language = transcription?.language;
+      await generateTranscription({ originalText: content, language });
+      setOpen(false);
+    } catch (e) {
+      toast.error(e instanceof Error ? e.message : String(e));
+    }
+    setSubmiting(false);
+  };
+
+  return (
+    <Dialog open={open} onOpenChange={setOpen}>
+      <DialogTrigger asChild>
+        {props.children ? (
+          props.children
+        ) : (
+          <Button variant="outline" size="sm">
+            <span className="capitalize">{t("edit")}</span>
+          </Button>
+        )}
+      </DialogTrigger>
+      <DialogContent className="max-w-screen-sm xl:max-w-screen-md">
+        <DialogHeader>
+          <DialogTitle>{t("editTranscription")}</DialogTitle>
+        </DialogHeader>
+        <div>
+          <Textarea
+            disabled={submiting}
+            className="h-96 text-lg font-serif resize-none"
+            value={content}
+            onChange={(e) => setContent(e.target.value)}
+          />
+        </div>
+        <DialogFooter>
+          <DialogClose asChild>
+            <Button disabled={submiting} variant="secondary">
+              {t("cancel")}
+            </Button>
+          </DialogClose>
+
+          <AlertDialog>
+            <AlertDialogTrigger asChild>
+              <Button disabled={submiting}>
+                {submiting && <LoaderIcon className="animate-spin w-4 mr-2" />}
+                {t("save")}
+              </Button>
+            </AlertDialogTrigger>
+            <AlertDialogContent>
+              <AlertDialogHeader>
+                <AlertDialogTitle>{t("saveTranscription")}</AlertDialogTitle>
+                <AlertDialogDescription>
+                  {t("areYouSureToSaveTranscription")}
+                </AlertDialogDescription>
+              </AlertDialogHeader>
+              <AlertDialogFooter>
+                <AlertDialogCancel disabled={submiting}>
+                  {t("cancel")}
+                </AlertDialogCancel>
+                <AlertDialogAction asChild>
+                  <Button disabled={submiting} onClick={handleSave}>
+                    {submiting && (
+                      <LoaderIcon className="animate-spin w-4 mr-2" />
+                    )}
+                    {t("save")}
+                  </Button>
+                </AlertDialogAction>
+              </AlertDialogFooter>
+            </AlertDialogContent>
+          </AlertDialog>
+        </DialogFooter>
+      </DialogContent>
+    </Dialog>
+  );
+};
--- a/enjoy/src/renderer/context/app-settings-provider.tsx
+++ b/enjoy/src/renderer/context/app-settings-provider.tsx
@@ -1,5 +1,5 @@
 import { createContext, useEffect, useState } from "react";
-import { WEB_API_URL, LANGUAGES } from "@/constants";
+import { WEB_API_URL, LANGUAGES, IPA_MAPPINGS } from "@/constants";
 import { Client } from "@/api";
 import i18n from "@renderer/i18n";
 import ahoy from "ahoy.js";
@@ -26,6 +26,8 @@ type AppSettingsProviderState = {
  setProxy?: (config: ProxyConfigType) => Promise<void>;
  cable?: Consumer;
  ahoy?: typeof ahoy;
+  // remote config
+  ipaMappings?: { [key: string]: string };
 };

 const initialState: AppSettingsProviderState = {
@@ -53,6 +55,9 @@ export const AppSettingsProvider = ({
  const [learningLanguage, setLearningLanguage] = useState<string>("en-US");
  const [proxy, setProxy] = useState<ProxyConfigType>();
  const EnjoyApp = window.__ENJOY_APP__;
+  const [ipaMappings, setIpaMappings] = useState<{ [key: string]: string }>(
+    IPA_MAPPINGS
+  );

  useEffect(() => {
    fetchVersion();
@@ -82,6 +87,14 @@ export const AppSettingsProvider = ({
    });
  }, [apiUrl]);

+  useEffect(() => {
+    if (!webApi) return;
+
+    webApi.config("ipa_mappings").then((mappings) => {
+      if (mappings) setIpaMappings(mappings);
+    });
+  }, [webApi]);
+
  const fetchLanguages = async () => {
    const language = await EnjoyApp.settings.getLanguage();
    setLanguage(language as "en" | "zh-CN");
@@ -206,6 +219,7 @@ export const AppSettingsProvider = ({
        initialized: Boolean(user && libraryPath),
        ahoy,
        cable,
+        ipaMappings,
      }}
    >
      {children}
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -16,7 +16,6 @@ import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { toast } from "@renderer/components/ui";
 import { Tooltip } from "react-tooltip";
 import { debounce } from "lodash";
-import { IPA_MAPPINGS } from "@/constants";

 type MediaPlayerContextType = {
  layout: {
@@ -69,6 +68,7 @@ type MediaPlayerContextType = {
  generateTranscription: (params?: {
    originalText?: string;
    language?: string;
+    service?: WhisperConfigType["service"];
  }) => void;
  transcribing: boolean;
  transcribingProgress: number;
@@ -89,8 +89,6 @@ type MediaPlayerContextType = {
  // Segments
  currentSegment: SegmentType;
  createSegment: () => Promise<SegmentType | void>;
-  // remote config
-  ipaMappings: { [key: string]: string };
  getCachedSegmentIndex: () => Promise<number>;
  setCachedSegmentIndex: (index: number) => void;
 };
@@ -169,10 +167,6 @@ export const MediaPlayerProvider = ({
  const [transcriptionDraft, setTranscriptionDraft] =
    useState<TranscriptionType["result"]>();

-  const [ipaMappings, setIpaMappings] = useState<{ [key: string]: string }>(
-    IPA_MAPPINGS
-  );
-
  const {
    transcription,
    generateTranscription,
@@ -364,7 +358,7 @@ export const MediaPlayerProvider = ({
          );
          labels[index] = [
            labels[index] || "",
-            learningLanguage.startsWith("en")
+            (media?.language || learningLanguage).startsWith("en")
              ? convertIpaToNormal(phone.text.trim())
              : phone.text.trim(),
          ].join("");
@@ -575,10 +569,6 @@ export const MediaPlayerProvider = ({
  useEffect(() => {
    calculateHeight();

-    webApi.config("ipa_mappings").then((mappings) => {
-      if (mappings) setIpaMappings(mappings);
-    });
-
    EnjoyApp.window.onResize(() => {
      deboundeCalculateHeight();
    });
@@ -635,7 +625,6 @@ export const MediaPlayerProvider = ({
          createNote,
          currentSegment: segment,
          createSegment,
-          ipaMappings,
          getCachedSegmentIndex,
          setCachedSegmentIndex,
        }}
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -12,10 +12,8 @@ import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { useAiCommand } from "./use-ai-command";

 export const useTranscribe = () => {
-  const { EnjoyApp, user, webApi, learningLanguage } = useContext(
-    AppSettingsProviderContext
-  );
-  const { whisperConfig, openai } = useContext(AISettingsProviderContext);
+  const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
+  const { openai } = useContext(AISettingsProviderContext);
  const { punctuateText } = useAiCommand();

  const transcode = async (src: string | Blob): Promise<string> => {
@@ -36,7 +34,8 @@ export const useTranscribe = () => {
      targetId?: string;
      targetType?: string;
      originalText?: string;
-      language?: string;
+      language: string;
+      service: WhisperConfigType["service"];
    }
  ): Promise<{
    engine: string;
@@ -45,12 +44,8 @@ export const useTranscribe = () => {
    originalText?: string;
  }> => {
    const url = await transcode(mediaSrc);
-    const {
-      targetId,
-      targetType,
-      originalText,
-      language = learningLanguage.split("-")[0],
-    } = params || {};
+    const { targetId, targetType, originalText, language, service } =
+      params || {};
    const blob = await (await fetch(url)).blob();

    let result;
@@ -59,19 +54,30 @@ export const useTranscribe = () => {
        engine: "original",
        model: "original",
      };
-    } else if (whisperConfig.service === "local") {
-      result = await transcribeByLocal(url);
-    } else if (whisperConfig.service === "cloudflare") {
+    } else if (service === "local") {
+      result = await transcribeByLocal(url, language);
+    } else if (service === "cloudflare") {
      result = await transcribeByCloudflareAi(blob);
-    } else if (whisperConfig.service === "openai") {
+    } else if (service === "openai") {
      result = await transcribeByOpenAi(blob);
-    } else if (whisperConfig.service === "azure") {
-      result = await transcribeByAzureAi(blob, { targetId, targetType });
+    } else if (service === "azure") {
+      result = await transcribeByAzureAi(blob, language, {
+        targetId,
+        targetType,
+      });
    } else {
      throw new Error(t("whisperServiceNotSupported"));
    }

    let transcript = originalText || result.text;
+
+    // Remove all content inside `()`, `[]`, `{}` and trim the text
+    transcript = transcript
+      .replace(/\(.*?\)/g, "")
+      .replace(/\[.*?\]/g, "")
+      .replace(/\{.*?\}/g, "")
+      .trim();
+
    // if the transcript does not contain any punctuation, use AI command to add punctuation
    if (!transcript.match(/\w[.,!?](\s|$)/)) {
      try {
@@ -96,12 +102,13 @@ export const useTranscribe = () => {
    };
  };

-  const transcribeByLocal = async (url: string) => {
+  const transcribeByLocal = async (url: string, language?: string) => {
    const res = await EnjoyApp.whisper.transcribe(
      {
        file: url,
      },
      {
+        language,
        force: true,
        extra: ["--prompt", `"Hello! Welcome to listen to this audio."`],
      }
@@ -157,6 +164,7 @@ export const useTranscribe = () => {

  const transcribeByAzureAi = async (
    blob: Blob,
+    language: string,
    params?: {
      targetId?: string;
      targetType?: string;
@@ -172,7 +180,7 @@ export const useTranscribe = () => {
      new File([blob], "audio.wav")
    );
    // setting the recognition language to learning language, such as 'en-US'.
-    config.speechRecognitionLanguage = learningLanguage;
+    config.speechRecognitionLanguage = language;
    config.requestWordLevelTimestamps();
    config.outputFormat = sdk.OutputFormat.Detailed;

--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -11,7 +11,9 @@ import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";

 export const useTranscriptions = (media: AudioType | VideoType) => {
  const { whisperConfig } = useContext(AISettingsProviderContext);
-  const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
+  const { EnjoyApp, webApi, learningLanguage } = useContext(
+    AppSettingsProviderContext
+  );
  const { addDblistener, removeDbListener } = useContext(DbProviderContext);
  const [transcription, setTranscription] = useState<TranscriptionType>(null);
  const { transcribe } = useTranscribe();
@@ -55,8 +57,13 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
  const generateTranscription = async (params?: {
    originalText?: string;
    language?: string;
+    service?: WhisperConfigType["service"];
  }) => {
-    let { originalText, language } = params || {};
+    let {
+      originalText,
+      language = learningLanguage,
+      service = whisperConfig.service,
+    } = params || {};
    if (originalText === undefined) {
      if (transcription?.targetId === media.id) {
        originalText = transcription.result?.originalText;
@@ -76,6 +83,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
        targetType: media.mediaType,
        originalText,
        language,
+        service,
      });

      let timeline: TimelineEntry[] = [];
@@ -169,7 +177,20 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
        },
        engine,
        model,
+        language,
      });
+
+      if (media.language !== language) {
+        if (media.mediaType === "Video") {
+          await EnjoyApp.videos.update(media.id, {
+            language,
+          });
+        } else {
+          await EnjoyApp.audios.update(media.id, {
+            language,
+          });
+        }
+      }
    } catch (err) {
      toast.error(err.message);
    }
@@ -234,12 +255,12 @@ export const useTranscriptions = (media: AudioType | VideoType) => {

    addDblistener(onTransactionUpdate);

-    if (
-      transcription.state == "pending" ||
-      !transcription.result?.["timeline"]
-    ) {
-      findOrGenerateTranscription();
-    }
+    // if (
+    //   transcription.state == "pending" ||
+    //   !transcription.result?.["timeline"]
+    // ) {
+    //   findOrGenerateTranscription();
+    // }

    if (whisperConfig.service === "local") {
      EnjoyApp.whisper.onProgress((_, p: number) => {
--- a/enjoy/src/types/audio.d.ts
+++ b/enjoy/src/types/audio.d.ts
@@ -4,6 +4,7 @@ type AudioType = {
  source: string;
  name: string;
  filename: string;
+  language?: string;
  description?: string;
  src?: string;
  coverUrl?: string;
--- a/enjoy/src/types/enjoy-app.d.ts
+++ b/enjoy/src/types/enjoy-app.d.ts
@@ -264,6 +264,7 @@ type EnjoyAppType = {
        blob?: { type: string; arrayBuffer: ArrayBuffer };
      },
      options?: {
+        language?: string;
        force?: boolean;
        extra?: string[];
      }
--- a/enjoy/src/types/segment.d.ts
+++ b/enjoy/src/types/segment.d.ts
@@ -2,6 +2,7 @@ type SegmentType = {
  id: string;
  targetId: string;
  targetType: string;
+  target: AudioType | VideoType;
  caption: TimelineEntry;
  audio?: AudioType;
  video?: VideoType;
@@ -14,7 +15,7 @@ type SegmentType = {
  isSynced?: boolean;
  isUploaded?: boolean;
  syncedAt?: Date;
-  uploadedAt?: Date
-  updatedAt: Date
-  createdAt: Date
-};
+  uploadedAt?: Date;
+  updatedAt: Date;
+  createdAt: Date;
+};
--- a/enjoy/src/types/transcription.d.ts
+++ b/enjoy/src/types/transcription.d.ts
@@ -6,6 +6,7 @@ type TranscriptionType = {
  state: "pending" | "processing" | "finished";
  engine: string;
  model: string;
+  language?: string;
  result: AlignmentResult & { original?: string };
 };

--- a/enjoy/src/types/video.d.ts
+++ b/enjoy/src/types/video.d.ts
@@ -4,15 +4,15 @@ type VideoType = {
  source: string;
  name: string;
  filename: string;
+  language?: string;
  description?: string;
-  filename?: string;
+  src?: string;
  coverUrl?: string;
  md5: string;
-  src?: string;
  metadata?: Ffmpeg.FfprobeData;
  duration?: number;
-  transcribed: boolean;
-  transcribing: boolean;
+  transcribed?: boolean;
+  transcribing?: boolean;
  recordingsCount?: number;
  recordingsDuration?: number;
  isUploaded?: boolean;