diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json index d42e7edc..627a3822 100644 --- a/enjoy/src/i18n/en.json +++ b/enjoy/src/i18n/en.json @@ -611,8 +611,11 @@ "assessing": "Assessing", "assessedSuccessfully": "Assessed successfully", "optinal": "Optional", - "uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)", + "uploadTranscriptFile": "Upload transcript file", + "uploadTranscriptFileDescription": "Optional. Supported formats: txt/srt/vtt.", "onlyTextFileIsSupported": "Only text file is supported", + "isolateVoice": "Isolate voice", + "isolateVoiceDescription": "Isolates the voice from any music or background ambience. More accurate but slower.", "sortBy": "Sort by", "createdAtDesc": "Created at desc", "createdAtAsc": "Created at asc", @@ -627,5 +630,6 @@ "search": "Search", "noData": "No data", "selectedFiles": "Selected files", - "moreOptions": "More options" + "moreOptions": "More options", + "lessOptions": "Fewer options" } diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json index 549da3c6..adb6a9c0 100644 --- a/enjoy/src/i18n/zh-CN.json +++ b/enjoy/src/i18n/zh-CN.json @@ -611,8 +611,11 @@ "assessing": "正在评估", "assessedSuccessfully": "评估成功", "optinal": "可选", - "uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)", + "uploadTranscriptFile": "上传字幕文件", + "uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。", "onlyTextFileIsSupported": "仅支持文本文件", + "isolateVoice": "提取人声", + "isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。", "sortBy": "排序", "createdAtDesc": "创建时间降序", "createdAtAsc": "创建时间升序", @@ -627,5 +630,6 @@ "search": "搜索", "noData": "没有数据", "selectedFiles": "已选中文件", - "moreOptions": "更多选项" + "moreOptions": "更多选项", + "lessOptions": "更少选项" } diff --git a/enjoy/src/main/window.ts b/enjoy/src/main/window.ts index 59fb88da..785d30f0 100644 --- a/enjoy/src/main/window.ts +++ b/enjoy/src/main/window.ts @@ -431,6 +431,28 @@ ${log} return { action: "allow" }; }); + // Capture stderr & stdout and send them to renderer + const originalStderrWrite = process.stderr.write.bind(process.stderr); + process.stderr.write = (chunk, encoding?, callback?) => { + // Remove ANSI color codes + const output = chunk + .toString() + .replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, ""); + mainWindow.webContents.send("app-on-cmd-output", output); + + return originalStderrWrite(chunk, encoding, callback); + }; + const originalStdoutWrite = process.stdout.write.bind(process.stdout); + process.stdout.write = (chunk, encoding?, callback?) => { + // Remove ANSI color codes + const output = chunk + .toString() + .replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, ""); + mainWindow.webContents.send("app-on-cmd-output", output); + + return originalStdoutWrite(chunk, encoding, callback); + }; + // and load the index.html of the app.
if (MAIN_WINDOW_VITE_DEV_SERVER_URL) { mainWindow.loadURL(MAIN_WINDOW_VITE_DEV_SERVER_URL); diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts index 7366a292..0a2047e4 100644 --- a/enjoy/src/preload.ts +++ b/enjoy/src/preload.ts @@ -35,6 +35,12 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", { createIssue: (title: string, body: string) => { return ipcRenderer.invoke("app-create-issue", title, body); }, + onCmdOutput: (callback: (event: IpcRendererEvent, data: string) => void) => { + ipcRenderer.on("app-on-cmd-output", callback); + }, + removeCmdOutputListeners: () => { + ipcRenderer.removeAllListeners("app-on-cmd-output"); + }, version, }, window: { diff --git a/enjoy/src/renderer/components/medias/media-current-recording.tsx b/enjoy/src/renderer/components/medias/media-current-recording.tsx index 917ea7d8..50433947 100644 --- a/enjoy/src/renderer/components/medias/media-current-recording.tsx +++ b/enjoy/src/renderer/components/medias/media-current-recording.tsx @@ -64,7 +64,7 @@ export const MediaCurrentRecording = () => { currentTime: mediaCurrentTime, } = useContext(MediaPlayerProviderContext); const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext); - const { enabled, currentHotkeys } = useContext( + const { currentHotkeys } = useContext( HotKeysSettingsProviderContext ); const [player, setPlayer] = useState(null); diff --git a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx index 01310c6c..7c3bd3e6 100644 --- a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx +++ b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx @@ -9,6 +9,7 @@ import { AlertDialogContent, AlertDialogTitle, AlertDialogDescription, + toast, } from "@renderer/components/ui"; import { LoaderIcon } from "lucide-react"; import { TranscriptionCreateForm } from "../transcriptions"; @@ -22,6 +23,7 @@ export const MediaTranscriptionGenerateButton = (props: { transcribing, transcription, transcribingProgress, + transcribingOutput, } = useContext(MediaPlayerProviderContext); const [open, setOpen] = useState(false); @@ -62,11 +64,18 @@ export const MediaTranscriptionGenerateButton = (props: { originalText: data.text, language: data.language, service: data.service as WhisperConfigType["service"], - }); - setOpen(false); + isolate: data.isolate, + }) + .then(() => { + setOpen(false); + }) + .catch((e) => { + toast.error(e.message); + }); }} transcribing={transcribing} transcribingProgress={transcribingProgress} + transcribingOutput={transcribingOutput} /> diff --git a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx index fabcfa22..80ff3b90 100644 --- a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx +++ b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx @@ -12,6 +12,7 @@ import { CollapsibleContent, CollapsibleTrigger, Form, + FormDescription, FormField, FormItem, FormLabel, @@ -24,6 +25,7 @@ import { SelectItem, SelectTrigger,
SelectValue, + Switch, Textarea, toast, } from "@renderer/components/ui"; @@ -36,18 +38,21 @@ const transcriptionSchema = z.object({ language: z.string(), service: z.string(), text: z.string().optional(), + isolate: z.boolean().optional(), }); export const TranscriptionCreateForm = (props: { onSubmit: (data: z.infer) => void; originalText?: string; onCancel?: () => void; - transcribing?: boolean; - transcribingProgress?: number; + transcribing: boolean; + transcribingProgress: number; + transcribingOutput: string; }) => { const { transcribing = false, transcribingProgress = 0, + transcribingOutput, onSubmit, onCancel, originalText, @@ -62,6 +67,7 @@ export const TranscriptionCreateForm = (props: { language: learningLanguage, service: whisperConfig.service, text: originalText, + isolate: false, }, }); @@ -127,7 +133,7 @@ export const TranscriptionCreateForm = (props: { control={form.control} name="service" render={({ field }) => ( - + {t("sttAiService")} - + ( - + {t("uploadTranscriptFile")}({t("optinal")}) @@ -205,6 +211,9 @@ export const TranscriptionCreateForm = (props: { } }} /> + + {t("uploadTranscriptFileDescription")} + {field.value != undefined && ( <> {t("transcript")} @@ -219,45 +228,92 @@ export const TranscriptionCreateForm = (props: { )} /> + ( + + {t("isolateVoice")} + + + {t("isolateVoiceDescription")} + + + )} + /> -
+
- {transcribing && form.watch("service") === "local" && ( -
-
- - {t("transcribing")} -
- {whisperConfig.service === "local" && ( - - )} -
- )} +
- {onCancel && ( + {onCancel && !transcribing && ( )}
); }; + +const TranscribeProgress = (props: { + service: string; + transcribing: boolean; + transcribingProgress: number; + transcribingOutput?: string; +}) => { + const { service, transcribing, transcribingProgress, transcribingOutput } = + props; + if (!transcribing) return null; + + return ( +
+
+ + {t("transcribing")} +
+ {service === "local" && transcribingProgress > 0 && ( + + )} + {transcribingOutput && ( +
+ + {transcribingOutput} + +
+ )} +
+ ); +}; diff --git a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx index b1f2774f..c6b32241 100644 --- a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx +++ b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx @@ -39,14 +39,12 @@ export const TranscriptionEditButton = (props: { const handleSave = async () => { setSubmiting(true); - try { - await generateTranscription({ originalText: content }); - setOpen(false); - } catch (e) { - toast.error(e.message); - } - - setSubmiting(false); + generateTranscription({ originalText: content }) + .then(() => setOpen(false)) + .catch((e) => { + toast.error(e.message); + }) + .finally(() => setSubmiting(false)); }; return ( diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx index ddef181c..5c050db7 100644 --- a/enjoy/src/renderer/context/media-player-provider.tsx +++ b/enjoy/src/renderer/context/media-player-provider.tsx @@ -69,9 +69,11 @@ type MediaPlayerContextType = { originalText?: string; language?: string; service?: WhisperConfigType["service"]; - }) => void; + isolate?: boolean; + }) => Promise; transcribing: boolean; transcribingProgress: number; + transcribingOutput: string; transcriptionDraft: TranscriptionType["result"]; setTranscriptionDraft: (result: TranscriptionType["result"]) => void; // Recordings @@ -172,6 +174,7 @@ export const MediaPlayerProvider = ({ generateTranscription, transcribing, transcribingProgress, + transcribingOutput, abortGenerateTranscription, } = useTranscriptions(media); @@ -611,6 +614,7 @@ export const MediaPlayerProvider = ({ generateTranscription, transcribing, transcribingProgress, + transcribingOutput, transcriptionDraft, setTranscriptionDraft, isRecording, diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx index 79d9e515..ad4f9ad7 100644 --- a/enjoy/src/renderer/hooks/use-transcribe.tsx +++ b/enjoy/src/renderer/hooks/use-transcribe.tsx @@ -3,7 +3,7 @@ import { AISettingsProviderContext, } from "@renderer/context"; import OpenAI from "openai"; -import { useContext } from "react"; +import { useContext, useState } from "react"; import { t } from "i18next"; import { AI_WORKER_ENDPOINT } from "@/constants"; import * as sdk from "microsoft-cognitiveservices-speech-sdk"; @@ -15,6 +15,7 @@ export const useTranscribe = () => { const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext); const { openai } = useContext(AISettingsProviderContext); const { punctuateText } = useAiCommand(); + const [output, setOutput] = useState(""); const transcode = async (src: string | Blob): Promise => { if (src instanceof Blob) { @@ -36,6 +37,7 @@ export const useTranscribe = () => { originalText?: string; language: string; service: WhisperConfigType["service"]; + isolate?: boolean; } ): Promise<{ engine: string; @@ -45,8 +47,14 @@ export const useTranscribe = () => { tokenId?: number; }> => { const url = await transcode(mediaSrc); - const { targetId, targetType, originalText, language, service } = - params || {}; + const { + targetId, + targetType, + originalText, + language, + service, + isolate = false, + } = params || {}; const blob = await (await fetch(url)).blob(); let result; @@ -70,6 +78,8 @@ export const useTranscribe = () => { throw new Error(t("whisperServiceNotSupported")); } + setOutput(null); + let transcript = originalText || 
result.text; // Remove all content inside `()`, `[]`, `{}` and trim the text @@ -93,6 +103,7 @@ export const useTranscribe = () => { transcript, { language, + isolate, } ); @@ -193,7 +204,8 @@ export const useTranscribe = () => { return new Promise((resolve, reject) => { reco.recognizing = (_s, e) => { - console.log(e.result.text); + console.log(e.result); + setOutput(e.result.text); }; reco.recognized = (_s, e) => { @@ -230,5 +242,6 @@ export const useTranscribe = () => { return { transcode, transcribe, + output, }; }; diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx index b5db427d..1d172703 100644 --- a/enjoy/src/renderer/hooks/use-transcriptions.tsx +++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx @@ -16,11 +16,17 @@ export const useTranscriptions = (media: AudioType | VideoType) => { ); const { addDblistener, removeDbListener } = useContext(DbProviderContext); const [transcription, setTranscription] = useState(null); - const { transcribe } = useTranscribe(); + const { transcribe, output } = useTranscribe(); const [transcribingProgress, setTranscribingProgress] = useState(0); const [transcribing, setTranscribing] = useState(false); + const [transcribingOutput, setTranscribingOutput] = useState(""); + const [service, setService] = useState( + whisperConfig.service + ); const onTransactionUpdate = (event: CustomEvent) => { + if (!transcription) return; + const { model, action, record } = event.detail || {}; if ( model === "Transcription" && @@ -58,12 +64,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => { originalText?: string; language?: string; service?: WhisperConfigType["service"]; + isolate?: boolean; }) => { let { originalText, language = learningLanguage, service = whisperConfig.service, + isolate = false, } = params || {}; + setService(service); + if (originalText === undefined) { if (transcription?.targetId === media.id) { originalText = transcription.result?.originalText; @@ -77,131 +87,135 @@ export const useTranscriptions = (media: AudioType | VideoType) => { setTranscribing(true); setTranscribingProgress(0); - try { - const { engine, model, alignmentResult, tokenId } = await transcribe( - media.src, - { - targetId: media.id, - targetType: media.mediaType, - originalText, - language, - service, - } - ); - - let timeline: TimelineEntry[] = []; - alignmentResult.timeline.forEach((t) => { - if (t.type === "sentence") { - timeline.push(t); - } else { - t.timeline.forEach((st) => { - timeline.push(st); - }); - } - }); - - /* - * Pre-process - * 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc - * 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display; - * 3. 
Some numbers with `%` are split into `number + percent` in words timeline, merge them for display; - */ - try { - timeline.forEach((sentence, i) => { - const nextSentence = timeline[i + 1]; - if ( - !sentence.text - .replaceAll(MAGIC_TOKEN_REGEX, "") - .match(END_OF_SENTENCE_REGEX) && - nextSentence?.text - ) { - nextSentence.text = [sentence.text, nextSentence.text].join(" "); - nextSentence.timeline = [ - ...sentence.timeline, - ...nextSentence.timeline, - ]; - nextSentence.startTime = sentence.startTime; - timeline.splice(i, 1); - } else { - const words = sentence.text.split(" "); - - sentence.timeline.forEach((token, j) => { - const word = words[j]?.trim()?.toLowerCase(); - - const match = word?.match(/-|%/); - if (!match) return; - - if ( - word === "-" && - token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase() - ) { - sentence.timeline.splice(j, 0, { - type: "token", - text: "-", - startTime: sentence.timeline[j - 1]?.endTime || 0, - endTime: sentence.timeline[j - 1]?.endTime || 0, - timeline: [], - }); - return; - } - - for (let k = j + 1; k <= sentence.timeline.length - 1; k++) { - if (word.includes(sentence.timeline[k].text.toLowerCase())) { - let connector = ""; - if (match[0] === "-") { - connector = "-"; - } - token.text = [token.text, sentence.timeline[k].text].join( - connector - ); - token.timeline = [ - ...token.timeline, - ...sentence.timeline[k].timeline, - ]; - token.endTime = sentence.timeline[k].endTime; - sentence.timeline.splice(k, 1); - } else { - break; - } - } - }); - } - }); - } catch (err) { - console.error(err); - } - - await EnjoyApp.transcriptions.update(transcription.id, { - state: "finished", - result: { - timeline: timeline, - transcript: alignmentResult.transcript, - originalText, - tokenId, - }, - engine, - model, + const { engine, model, alignmentResult, tokenId } = await transcribe( + media.src, + { + targetId: media.id, + targetType: media.mediaType, + originalText, language, - }); - - if (media.language !== language) { - if (media.mediaType === "Video") { - await EnjoyApp.videos.update(media.id, { - language, - }); - } else { - await EnjoyApp.audios.update(media.id, { - language, - }); - } + service, + isolate, + } + ); + + let timeline: TimelineEntry[] = []; + alignmentResult.timeline.forEach((t) => { + if (t.type === "sentence") { + timeline.push(t); + } else { + t.timeline.forEach((st) => { + timeline.push(st); + }); + } + }); + + timeline = preProcessTranscription(timeline); + if (media.language !== language) { + if (media.mediaType === "Video") { + await EnjoyApp.videos.update(media.id, { + language, + }); + } else { + await EnjoyApp.audios.update(media.id, { + language, + }); } - } catch (err) { - toast.error(err.message); } + await EnjoyApp.transcriptions.update(transcription.id, { + state: "finished", + result: { + timeline: timeline, + transcript: alignmentResult.transcript, + originalText, + tokenId, + }, + engine, + model, + language, + }); + setTranscribing(false); }; + const preProcessTranscription = (timeline: TimelineEntry[]) => { + /* + * Pre-process + * 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc + * 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display; + * 3. 
Some numbers with `%` are split into `number + percent` in words timeline, merge them for display; + */ + try { + timeline.forEach((sentence, i) => { + const nextSentence = timeline[i + 1]; + if ( + !sentence.text + .replaceAll(MAGIC_TOKEN_REGEX, "") + .match(END_OF_SENTENCE_REGEX) && + nextSentence?.text + ) { + nextSentence.text = [sentence.text, nextSentence.text].join(" "); + nextSentence.timeline = [ + ...sentence.timeline, + ...nextSentence.timeline, + ]; + nextSentence.startTime = sentence.startTime; + timeline.splice(i, 1); + } else { + const words = sentence.text.split(" "); + + sentence.timeline.forEach((token, j) => { + const word = words[j]?.trim()?.toLowerCase(); + + const match = word?.match(/-|%/); + if (!match) return; + + if ( + word === "-" && + token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase() + ) { + sentence.timeline.splice(j, 0, { + type: "token", + text: "-", + startTime: sentence.timeline[j - 1]?.endTime || 0, + endTime: sentence.timeline[j - 1]?.endTime || 0, + timeline: [], + }); + return; + } + + for (let k = j + 1; k <= sentence.timeline.length - 1; k++) { + if (word.includes(sentence.timeline[k].text.toLowerCase())) { + let connector = ""; + if (match[0] === "-") { + connector = "-"; + } + token.text = [token.text, sentence.timeline[k].text].join( + connector + ); + token.timeline = [ + ...token.timeline, + ...sentence.timeline[k].timeline, + ]; + token.endTime = sentence.timeline[k].endTime; + sentence.timeline.splice(k, 1); + } else { + break; + } + } + }); + } + }); + } catch (err) { + console.warn(err); + toast.warning( + `Failed to pre-process transcription timeline: ${err.message}` + ); + } + return timeline; + }; + const findTranscriptionFromWebApi = async () => { if (!transcription) { await findOrCreateTranscription(); @@ -252,32 +266,40 @@ export const useTranscriptions = (media: AudioType | VideoType) => { }, [media]); /* - * auto-generate transcription result + * listen to transcription update */ useEffect(() => { if (!transcription) return; addDblistener(onTransactionUpdate); + return () => { + removeDbListener(onTransactionUpdate); + }; + }, [transcription]); - // if ( - // transcription.state == "pending" || - // !transcription.result?.["timeline"] - // ) { - // findOrGenerateTranscription(); - // } + /* + * listen to transcribe progress + */ + useEffect(() => { + if (!transcribing) return; - if (whisperConfig.service === "local") { + if (service === "local") { EnjoyApp.whisper.onProgress((_, p: number) => { if (p > 100) p = 100; setTranscribingProgress(p); }); } + EnjoyApp.app.onCmdOutput((_, output) => { + setTranscribingOutput(output); + }); + return () => { - removeDbListener(onTransactionUpdate); EnjoyApp.whisper.removeProgressListeners(); + EnjoyApp.app.removeCmdOutputListeners(); + setTranscribingOutput(null); }; - }, [transcription, media]); + }, [media, service, transcribing]); const abortGenerateTranscription = () => { EnjoyApp.whisper.abort(); @@ -288,6 +310,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => { transcription, transcribingProgress, transcribing, + transcribingOutput: output || transcribingOutput, generateTranscription, abortGenerateTranscription, }; diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts index 07954ca6..f23ff54b 100644 --- a/enjoy/src/types/enjoy-app.d.ts +++ b/enjoy/src/types/enjoy-app.d.ts @@ -10,6 +10,8 @@ type EnjoyAppType = { quit: () => Promise; openDevTools: () => Promise; createIssue: (title: string, body: string) => Promise; + onCmdOutput: 
(callback: (event, output: string) => void) => void; + removeCmdOutputListeners: () => void; version: string; }; window: {
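Note on the stdout/stderr forwarding added in enjoy/src/main/window.ts: the patched write() overrides strip ANSI color codes before sending text over the "app-on-cmd-output" channel. Below is a minimal standalone sketch of that stripping step, using the same regular expression as the diff; the sample log line is made up for illustration and is not produced by this change.

// A quick standalone check (not part of the diff) of the ANSI stripping done
// in main/window.ts before output is forwarded to the renderer.
// The regular expression is copied verbatim from the patched write() overrides.
const stripAnsi = (text: string): string =>
  text.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");

// "\x1B[32m...\x1B[0m" wraps the text in green; after stripping, the renderer
// receives plain text on the "app-on-cmd-output" channel.
console.assert(
  stripAnsi("\x1B[32mtranscribing: 42%\x1B[0m") === "transcribing: 42%"
);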
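For context on how the new bridge methods are consumed, use-transcriptions.tsx attaches the listener while transcribing and removes it on cleanup. Below is a minimal sketch of the same pattern as a standalone renderer hook; the hook name useCmdOutput is illustrative and not part of this change, and EnjoyApp refers to the __ENJOY_APP__ bridge described by EnjoyAppType in enjoy-app.d.ts.

import { useEffect, useState } from "react";

// Hypothetical hook (name and shape are illustrative, not part of this change):
// mirror the latest command-line output forwarded from the main process into
// React state, and detach the listener on unmount so the "app-on-cmd-output"
// channel does not keep stale callbacks around.
const useCmdOutput = (EnjoyApp: EnjoyAppType): string => {
  const [output, setOutput] = useState<string>("");

  useEffect(() => {
    EnjoyApp.app.onCmdOutput((_event, data) => setOutput(data));
    return () => {
      EnjoyApp.app.removeCmdOutputListeners();
    };
  }, [EnjoyApp]);

  return output;
};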