Improve stt (#764)

* refactor

* add isolate option for alignment

* setup isolate for alignment

* refactor transcription form

* add transcribing output

* update locale

* refactor

* display transcribing output

* cannot cancel when transcribing
This commit is contained in:
an-lee
2024-07-03 11:19:44 +08:00
committed by GitHub
parent d9534bcae8
commit f440947ea0
12 changed files with 315 additions and 172 deletions

View File

@@ -611,8 +611,11 @@
"assessing": "Assessing",
"assessedSuccessfully": "Assessed successfully",
"optinal": "Optional",
"uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)",
"uploadTranscriptFile": "Upload transcript file",
"uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
"onlyTextFileIsSupported": "Only text file is supported",
"isolateVoice": "Isolate voice",
"isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
"sortBy": "Sort by",
"createdAtDesc": "Created at desc",
"createdAtAsc": "Created at asc",
@@ -627,5 +630,6 @@
"search": "Search",
"noData": "No data",
"selectedFiles": "Selected files",
"moreOptions": "More options"
"moreOptions": "More options",
"lessOptions": "Less options"
}

View File

@@ -611,8 +611,11 @@
"assessing": "正在评估",
"assessedSuccessfully": "评估成功",
"optinal": "可选",
"uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)",
"uploadTranscriptFile": "上传字幕文件",
"uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
"onlyTextFileIsSupported": "仅支持文本文件",
"isolateVoice": "提取人声",
"isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
"sortBy": "排序",
"createdAtDesc": "创建时间降序",
"createdAtAsc": "创建时间升序",
@@ -627,5 +630,6 @@
"search": "搜索",
"noData": "没有数据",
"selectedFiles": "已选中文件",
"moreOptions": "更多选项"
"moreOptions": "更多选项",
"lessOptions": "更少选项"
}

View File

@@ -431,6 +431,28 @@ ${log}
return { action: "allow" };
});
// Capture stderr & stdout and send them to renderer
const originalStderrWrite = process.stderr.write.bind(process.stderr);
process.stderr.write = (chunk, encoding?, callback?) => {
// Remove ANSI color codes
const output = chunk
.toString()
.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");
mainWindow.webContents.send("app-on-cmd-output", output);
return originalStderrWrite(chunk, encoding, callback);
};
const originalStdoutWrite = process.stdout.write.bind(process.stdout);
process.stdout.write = (chunk, encoding?, callback?) => {
// Remove ANSI color codes
const output = chunk
.toString()
.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");
mainWindow.webContents.send("app-on-cmd-output", output);
return originalStdoutWrite(chunk, encoding, callback);
};
// and load the index.html of the app.
if (MAIN_WINDOW_VITE_DEV_SERVER_URL) {
mainWindow.loadURL(MAIN_WINDOW_VITE_DEV_SERVER_URL);

View File

@@ -2,6 +2,8 @@
// https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
import { version } from "../package.json";
import { callback } from "chart.js/dist/helpers/helpers.core";
import { remove } from "lodash";
contextBridge.exposeInMainWorld("__ENJOY_APP__", {
app: {
@@ -35,6 +37,12 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
createIssue: (title: string, body: string) => {
return ipcRenderer.invoke("app-create-issue", title, body);
},
onCmdOutput: (callback: (event: IpcRendererEvent, data: string) => void) => {
ipcRenderer.on("app-on-cmd-output", callback);
},
removeCmdOutputListeners: () => {
ipcRenderer.removeAllListeners("app-on-cmd-output");
},
version,
},
window: {

View File

@@ -64,7 +64,7 @@ export const MediaCurrentRecording = () => {
currentTime: mediaCurrentTime,
} = useContext(MediaPlayerProviderContext);
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
const { enabled, currentHotkeys } = useContext(
const { currentHotkeys } = useContext(
HotKeysSettingsProviderContext
);
const [player, setPlayer] = useState(null);

View File

@@ -9,6 +9,7 @@ import {
AlertDialogContent,
AlertDialogTitle,
AlertDialogDescription,
toast,
} from "@renderer/components/ui";
import { LoaderIcon } from "lucide-react";
import { TranscriptionCreateForm } from "../transcriptions";
@@ -22,6 +23,7 @@ export const MediaTranscriptionGenerateButton = (props: {
transcribing,
transcription,
transcribingProgress,
transcribingOutput,
} = useContext(MediaPlayerProviderContext);
const [open, setOpen] = useState(false);
@@ -62,11 +64,18 @@ export const MediaTranscriptionGenerateButton = (props: {
originalText: data.text,
language: data.language,
service: data.service as WhisperConfigType["service"],
});
setOpen(false);
isolate: data.isolate,
})
.then(() => {
setOpen(false);
})
.catch((e) => {
toast.error(e.message);
});
}}
transcribing={transcribing}
transcribingProgress={transcribingProgress}
transcribingOutput={transcribingOutput}
/>
</AlertDialogContent>
</AlertDialog>

View File

@@ -12,6 +12,7 @@ import {
CollapsibleContent,
CollapsibleTrigger,
Form,
FormDescription,
FormField,
FormItem,
FormLabel,
@@ -24,6 +25,7 @@ import {
SelectItem,
SelectTrigger,
SelectValue,
Switch,
Textarea,
toast,
} from "@renderer/components/ui";
@@ -36,18 +38,21 @@ const transcriptionSchema = z.object({
language: z.string(),
service: z.string(),
text: z.string().optional(),
isolate: z.boolean().optional(),
});
export const TranscriptionCreateForm = (props: {
onSubmit: (data: z.infer<typeof transcriptionSchema>) => void;
originalText?: string;
onCancel?: () => void;
transcribing?: boolean;
transcribingProgress?: number;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput: string;
}) => {
const {
transcribing = false,
transcribingProgress = 0,
transcribingOutput,
onSubmit,
onCancel,
originalText,
@@ -62,6 +67,7 @@ export const TranscriptionCreateForm = (props: {
language: learningLanguage,
service: whisperConfig.service,
text: originalText,
isolate: false,
},
});
@@ -127,7 +133,7 @@ export const TranscriptionCreateForm = (props: {
control={form.control}
name="service"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>{t("sttAiService")}</FormLabel>
<Select
disabled={transcribing}
@@ -153,7 +159,7 @@ export const TranscriptionCreateForm = (props: {
control={form.control}
name="language"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>{t("language")}</FormLabel>
<Select
disabled={transcribing}
@@ -176,12 +182,12 @@ export const TranscriptionCreateForm = (props: {
)}
/>
<Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
<CollapsibleContent>
<CollapsibleContent className="space-y-4 mb-4">
<FormField
control={form.control}
name="text"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>
{t("uploadTranscriptFile")}({t("optinal")})
</FormLabel>
@@ -205,6 +211,9 @@ export const TranscriptionCreateForm = (props: {
}
}}
/>
<FormDescription>
{t("uploadTranscriptFileDescription")}
</FormDescription>
{field.value != undefined && (
<>
<FormLabel>{t("transcript")}</FormLabel>
@@ -219,45 +228,92 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
<FormField
control={form.control}
name="isolate"
render={({ field }) => (
<FormItem className="grid w-full items-center">
<FormLabel>{t("isolateVoice")}</FormLabel>
<Switch
checked={field.value}
onCheckedChange={field.onChange}
disabled={transcribing}
/>
<FormDescription>
{t("isolateVoiceDescription")}
</FormDescription>
</FormItem>
)}
/>
</CollapsibleContent>
<div className="flex justify-center my-4">
<div className="flex justify-center">
<CollapsibleTrigger asChild>
<Button variant="ghost" size="sm">
<span className="">{t("moreOptions")}</span>
{collapsibleOpen ? (
<ChevronUpIcon className="h-4 w-4" />
<>
<ChevronUpIcon className="h-4 w-4" />
<span className="ml-2">{t("lessOptions")}</span>
</>
) : (
<ChevronDownIcon className="h-4 w-4" />
<>
<ChevronDownIcon className="h-4 w-4" />
<span className="ml-2">{t("moreOptions")}</span>
</>
)}
</Button>
</CollapsibleTrigger>
</div>
</Collapsible>
{transcribing && form.watch("service") === "local" && (
<div className="mb-4">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{whisperConfig.service === "local" && (
<Progress value={transcribingProgress} />
)}
</div>
)}
<TranscribeProgress
service={form.watch("service")}
transcribing={transcribing}
transcribingProgress={transcribingProgress}
transcribingOutput={transcribingOutput}
/>
<div className="flex justify-end space-x-4">
{onCancel && (
{onCancel && !transcribing && (
<Button type="reset" variant="outline" onClick={onCancel}>
{t("cancel")}
</Button>
)}
<Button disabled={transcribing} type="submit" variant="default">
{transcribing && <LoaderIcon className="animate-spin w-4 mr-2" />}
{t("transcribe")}
{t("continue")}
</Button>
</div>
</form>
</Form>
);
};
const TranscribeProgress = (props: {
service: string;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput?: string;
}) => {
const { service, transcribing, transcribingProgress, transcribingOutput } =
props;
if (!transcribing) return null;
return (
<div className="mb-4 space-y-2">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{service === "local" && transcribingProgress > 0 && (
<Progress value={transcribingProgress} />
)}
{transcribingOutput && (
<div className="max-w-full rounded-lg border bg-zinc-950 p-3 dark:bg-zinc-900 h-20 overflow-y-auto">
<code className="px-[0.3rem] py-[0.2rem] rounded text-muted-foreground font-mono text-xs break-words">
{transcribingOutput}
</code>
</div>
)}
</div>
);
};

View File

@@ -39,14 +39,12 @@ export const TranscriptionEditButton = (props: {
const handleSave = async () => {
setSubmiting(true);
try {
await generateTranscription({ originalText: content });
setOpen(false);
} catch (e) {
toast.error(e.message);
}
setSubmiting(false);
generateTranscription({ originalText: content })
.then(() => setOpen(false))
.catch((e) => {
toast.error(e.message);
})
.finally(() => setSubmiting(false));
};
return (

View File

@@ -69,9 +69,11 @@ type MediaPlayerContextType = {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
}) => void;
isolate?: boolean;
}) => Promise<void>;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput: string;
transcriptionDraft: TranscriptionType["result"];
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
// Recordings
@@ -172,6 +174,7 @@ export const MediaPlayerProvider = ({
generateTranscription,
transcribing,
transcribingProgress,
transcribingOutput,
abortGenerateTranscription,
} = useTranscriptions(media);
@@ -611,6 +614,7 @@ export const MediaPlayerProvider = ({
generateTranscription,
transcribing,
transcribingProgress,
transcribingOutput,
transcriptionDraft,
setTranscriptionDraft,
isRecording,

View File

@@ -3,7 +3,7 @@ import {
AISettingsProviderContext,
} from "@renderer/context";
import OpenAI from "openai";
import { useContext } from "react";
import { useContext, useState } from "react";
import { t } from "i18next";
import { AI_WORKER_ENDPOINT } from "@/constants";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
@@ -15,6 +15,7 @@ export const useTranscribe = () => {
const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
const { openai } = useContext(AISettingsProviderContext);
const { punctuateText } = useAiCommand();
const [output, setOutput] = useState<string>("");
const transcode = async (src: string | Blob): Promise<string> => {
if (src instanceof Blob) {
@@ -36,6 +37,7 @@ export const useTranscribe = () => {
originalText?: string;
language: string;
service: WhisperConfigType["service"];
isolate?: boolean;
}
): Promise<{
engine: string;
@@ -45,8 +47,14 @@ export const useTranscribe = () => {
tokenId?: number;
}> => {
const url = await transcode(mediaSrc);
const { targetId, targetType, originalText, language, service } =
params || {};
const {
targetId,
targetType,
originalText,
language,
service,
isolate = false,
} = params || {};
const blob = await (await fetch(url)).blob();
let result;
@@ -70,6 +78,8 @@ export const useTranscribe = () => {
throw new Error(t("whisperServiceNotSupported"));
}
setOutput(null);
let transcript = originalText || result.text;
// Remove all content inside `()`, `[]`, `{}` and trim the text
@@ -93,6 +103,7 @@ export const useTranscribe = () => {
transcript,
{
language,
isolate,
}
);
@@ -193,7 +204,8 @@ export const useTranscribe = () => {
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
console.log(e.result.text);
console.log(e.result);
setOutput(e.result.text);
};
reco.recognized = (_s, e) => {
@@ -230,5 +242,6 @@ export const useTranscribe = () => {
return {
transcode,
transcribe,
output,
};
};

View File

@@ -16,11 +16,17 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [transcription, setTranscription] = useState<TranscriptionType>(null);
const { transcribe } = useTranscribe();
const { transcribe, output } = useTranscribe();
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
const [transcribing, setTranscribing] = useState<boolean>(false);
const [transcribingOutput, setTranscribingOutput] = useState<string>("");
const [service, setService] = useState<WhisperConfigType["service"]>(
whisperConfig.service
);
const onTransactionUpdate = (event: CustomEvent) => {
if (!transcription) return;
const { model, action, record } = event.detail || {};
if (
model === "Transcription" &&
@@ -58,12 +64,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
isolate?: boolean;
}) => {
let {
originalText,
language = learningLanguage,
service = whisperConfig.service,
isolate = false,
} = params || {};
setService(service);
if (originalText === undefined) {
if (transcription?.targetId === media.id) {
originalText = transcription.result?.originalText;
@@ -77,131 +87,135 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
setTranscribing(true);
setTranscribingProgress(0);
try {
const { engine, model, alignmentResult, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
targetType: media.mediaType,
originalText,
language,
service,
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
/*
* Pre-process
* 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
* 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
* 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
*/
try {
timeline.forEach((sentence, i) => {
const nextSentence = timeline[i + 1];
if (
!sentence.text
.replaceAll(MAGIC_TOKEN_REGEX, "")
.match(END_OF_SENTENCE_REGEX) &&
nextSentence?.text
) {
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
nextSentence.timeline = [
...sentence.timeline,
...nextSentence.timeline,
];
nextSentence.startTime = sentence.startTime;
timeline.splice(i, 1);
} else {
const words = sentence.text.split(" ");
sentence.timeline.forEach((token, j) => {
const word = words[j]?.trim()?.toLowerCase();
const match = word?.match(/-|%/);
if (!match) return;
if (
word === "-" &&
token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
) {
sentence.timeline.splice(j, 0, {
type: "token",
text: "-",
startTime: sentence.timeline[j - 1]?.endTime || 0,
endTime: sentence.timeline[j - 1]?.endTime || 0,
timeline: [],
});
return;
}
for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
if (word.includes(sentence.timeline[k].text.toLowerCase())) {
let connector = "";
if (match[0] === "-") {
connector = "-";
}
token.text = [token.text, sentence.timeline[k].text].join(
connector
);
token.timeline = [
...token.timeline,
...sentence.timeline[k].timeline,
];
token.endTime = sentence.timeline[k].endTime;
sentence.timeline.splice(k, 1);
} else {
break;
}
}
});
}
});
} catch (err) {
console.error(err);
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
originalText,
tokenId,
},
engine,
model,
const { engine, model, alignmentResult, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
targetType: media.mediaType,
originalText,
language,
});
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
language,
});
} else {
await EnjoyApp.audios.update(media.id, {
language,
});
}
service,
isolate,
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
timeline = preProcessTranscription(timeline);
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
language,
});
} else {
await EnjoyApp.audios.update(media.id, {
language,
});
}
} catch (err) {
toast.error(err.message);
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
originalText,
tokenId,
},
engine,
model,
language,
});
setTranscribing(false);
};
const preProcessTranscription = (timeline: TimelineEntry[]) => {
/*
* Pre-process
* 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
* 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
* 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
*/
try {
timeline.forEach((sentence, i) => {
const nextSentence = timeline[i + 1];
if (
!sentence.text
.replaceAll(MAGIC_TOKEN_REGEX, "")
.match(END_OF_SENTENCE_REGEX) &&
nextSentence?.text
) {
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
nextSentence.timeline = [
...sentence.timeline,
...nextSentence.timeline,
];
nextSentence.startTime = sentence.startTime;
timeline.splice(i, 1);
} else {
const words = sentence.text.split(" ");
sentence.timeline.forEach((token, j) => {
const word = words[j]?.trim()?.toLowerCase();
const match = word?.match(/-|%/);
if (!match) return;
if (
word === "-" &&
token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
) {
sentence.timeline.splice(j, 0, {
type: "token",
text: "-",
startTime: sentence.timeline[j - 1]?.endTime || 0,
endTime: sentence.timeline[j - 1]?.endTime || 0,
timeline: [],
});
return;
}
for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
if (word.includes(sentence.timeline[k].text.toLowerCase())) {
let connector = "";
if (match[0] === "-") {
connector = "-";
}
token.text = [token.text, sentence.timeline[k].text].join(
connector
);
token.timeline = [
...token.timeline,
...sentence.timeline[k].timeline,
];
token.endTime = sentence.timeline[k].endTime;
sentence.timeline.splice(k, 1);
} else {
break;
}
}
});
}
});
} catch (err) {
console.warn(err);
toast.warning(
`Failed to pre-process transcription timeline: ${err.message}`
);
}
return timeline;
};
const findTranscriptionFromWebApi = async () => {
if (!transcription) {
await findOrCreateTranscription();
@@ -252,32 +266,40 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}, [media]);
/*
* auto-generate transcription result
* listen to transcription update
*/
useEffect(() => {
if (!transcription) return;
addDblistener(onTransactionUpdate);
return () => {
removeDbListener(onTransactionUpdate);
};
}, [transcription]);
// if (
// transcription.state == "pending" ||
// !transcription.result?.["timeline"]
// ) {
// findOrGenerateTranscription();
// }
/*
* listen to transcribe progress
*/
useEffect(() => {
if (!transcribing) return;
if (whisperConfig.service === "local") {
if (service === "local") {
EnjoyApp.whisper.onProgress((_, p: number) => {
if (p > 100) p = 100;
setTranscribingProgress(p);
});
}
EnjoyApp.app.onCmdOutput((_, output) => {
setTranscribingOutput(output);
});
return () => {
removeDbListener(onTransactionUpdate);
EnjoyApp.whisper.removeProgressListeners();
EnjoyApp.app.removeCmdOutputListeners();
setTranscribingOutput(null);
};
}, [transcription, media]);
}, [media, service, transcribing]);
const abortGenerateTranscription = () => {
EnjoyApp.whisper.abort();
@@ -288,6 +310,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
transcription,
transcribingProgress,
transcribing,
transcribingOutput: output || transcribingOutput,
generateTranscription,
abortGenerateTranscription,
};

View File

@@ -10,6 +10,8 @@ type EnjoyAppType = {
quit: () => Promise<void>;
openDevTools: () => Promise<void>;
createIssue: (title: string, body: string) => Promise<void>;
onCmdOutput: (callback: (event, output: string) => void) => void;
removeCmdOutputListeners: () => void;
version: string;
};
window: {