Feat: Improve alignment for audio with background noise (#870)

* use echogarden.alignSegments

* fix cloudflare whisper

* refactor azure ai transcribe

* refactor

* fix align result

* refactor

* edit transcription in srt format

* improve timeline

* refactor

* fix updating the current segment index

* validate text when using uploaded transcript

* add form description

* refactor codes

* do not change sentence timeline based on word timeline
an-lee
2024-07-23 15:24:24 +08:00
committed by GitHub
parent 57403cdf47
commit 078f5159ff
22 changed files with 458 additions and 134 deletions

View File

@@ -242,6 +242,7 @@
"downloadFfmpeg": "Download FFmpeg",
"youAreReadyToGo": "You are ready to go",
"welcomeBack": "Welcome back! {{name}}",
"print": "Print",
"download": "Download",
"downloading": "Downloading {{file}}",
"downloadedSuccessfully": "Downloaded successfully",
@@ -374,6 +375,7 @@
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
"openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
"uploadSpeechToTextDescription": "Upload transcript file or input transcript text to align.",
"checkingWhisper": "Checking whisper status",
"pleaseDownloadWhisperModelFirst": "Please download whisper model first",
"whisperIsWorkingGood": "Whisper is working good",
@@ -618,7 +620,8 @@
"assessedSuccessfully": "Assessed successfully",
"optinal": "Optional",
"uploadTranscriptFile": "Upload transcript file",
"uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
"uploadTranscriptFileDescription": "Support formats: txt/srt/vtt.",
"pleaseUploadTranscriptFile": "Please upload transcript file",
"onlyTextFileIsSupported": "Only text file is supported",
"isolateVoice": "Isolate voice(Experimental)",
"isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",

View File

@@ -242,6 +242,7 @@
"downloadFfmpeg": "下载 FFmpeg",
"youAreReadyToGo": "您已准备就绪",
"welcomeBack": "欢迎回来, {{name}}",
"print": "打印",
"download": "下载",
"downloading": "正在下载 {{file}}",
"downloadedSuccessfully": "下载成功",
@@ -374,6 +375,7 @@
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
"openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
"uploadSpeechToTextDescription": "上传字幕文件或者输入文本进行字幕对齐",
"checkingWhisper": "正在检查 Whisper",
"pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
"whisperIsWorkingGood": "Whisper 正常工作",
@@ -618,7 +620,8 @@
"assessedSuccessfully": "评估成功",
"optinal": "可选",
"uploadTranscriptFile": "上传字幕文件",
"uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
"uploadTranscriptFileDescription": "支持字幕文件格式: txt/srt/vtt。",
"pleaseUploadTranscriptFile": "请上传字幕文件",
"onlyTextFileIsSupported": "仅支持文本文件",
"isolateVoice": "提取人声(实验性)",
"isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",

View File

@@ -1,7 +1,6 @@
import { ipcMain } from "electron";
import * as Echogarden from "echogarden/dist/api/API.js";
import { AlignmentOptions } from "echogarden/dist/api/API";
import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
import {
encodeRawAudioToWave,
decodeWaveToRawAudio,
@@ -9,7 +8,9 @@ import {
getRawAudioDuration,
trimAudioStart,
trimAudioEnd,
AudioSourceParam,
} from "echogarden/dist/audio/AudioUtilities.js";
import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
import path from "path";
import log from "@main/logger";
import url from "url";
@@ -34,6 +35,7 @@ const __dirname = path
const logger = log.scope("echogarden");
class EchogardenWrapper {
public align: typeof Echogarden.align;
public alignSegments: typeof Echogarden.alignSegments;
public denoise: typeof Echogarden.denoise;
public encodeRawAudioToWave: typeof encodeRawAudioToWave;
public decodeWaveToRawAudio: typeof decodeWaveToRawAudio;
@@ -44,6 +46,7 @@ class EchogardenWrapper {
constructor() {
this.align = Echogarden.align;
this.alignSegments = Echogarden.alignSegments;
this.denoise = Echogarden.denoise;
this.encodeRawAudioToWave = encodeRawAudioToWave;
this.decodeWaveToRawAudio = decodeWaveToRawAudio;
@@ -110,6 +113,25 @@ class EchogardenWrapper {
}
);
ipcMain.handle(
"echogarden-align-segments",
async (
_event,
input: AudioSourceParam,
timeline: Timeline,
options: AlignmentOptions
) => {
logger.debug("echogarden-align-segments:", timeline, options);
try {
const rawAudio = await this.ensureRawAudio(input, 16000);
return await this.alignSegments(rawAudio, timeline, options);
} catch (err) {
logger.error(err);
throw err;
}
}
);
ipcMain.handle(
"echogarden-transcode",
async (_event, url: string, sampleRate?: number) => {

View File

@@ -94,7 +94,7 @@ const userDataPath = () => {
const apiUrl = () => {
const url: string = settings.getSync("apiUrl") as string;
return process.env.API_URL || url || WEB_API_URL;
return process.env.WEB_API_URL || url || WEB_API_URL;
};
export default {

View File

@@ -105,6 +105,8 @@ class Whipser {
`--model "${model.savePath}"`,
"--output-json",
`--output-file "${path.join(tmpDir, "jfk")}"`,
`--split-on-word true`,
`--max-len 1`,
];
logger.debug(`Checking whisper command: ${commands.join(" ")}`);
exec(
@@ -203,6 +205,9 @@ class Whipser {
"--print-progress",
"--language",
model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
`--split-on-word`,
`--max-len`,
"1",
...extra,
];
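
The new `--split-on-word` / `--max-len 1` flags ask whisper.cpp to cut its JSON output at word boundaries, so each transcription segment carries roughly one word with its own offsets. A rough sketch of the shape the renderer consumes (field names taken from the `transcribeByLocal` mapping later in this commit; the sample values are invented):

```typescript
// Rough shape of the per-word segments whisper.cpp emits with
// `--split-on-word --max-len 1`, and how transcribeByLocal (later in this
// commit) turns them into word timeline entries. Sample values are invented.
interface WhisperWordSegment {
  text: string;
  offsets: { from: number; to: number }; // milliseconds
}

const segments: WhisperWordSegment[] = [
  { text: " And", offsets: { from: 0, to: 320 } },
  { text: " so", offsets: { from: 320, to: 540 } },
  { text: " my", offsets: { from: 540, to: 760 } },
];

const wordTimeline = segments.map((word) => ({
  type: "word" as const,
  text: word.text,
  startTime: word.offsets.from / 1000.0,
  endTime: word.offsets.to / 1000.0,
}));
```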

View File

@@ -2,6 +2,7 @@
// https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
import { version } from "../package.json";
import { Timeline } from "echogarden/dist/utilities/Timeline";
contextBridge.exposeInMainWorld("__ENJOY_APP__", {
app: {
@@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
align: (input: string, transcript: string, options: any) => {
return ipcRenderer.invoke("echogarden-align", input, transcript, options);
},
alignSegments: (input: string, timeline: Timeline, options: any) => {
return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
},
transcode: (input: string) => {
return ipcRenderer.invoke("echogarden-transcode", input);
},
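
With the bridge exposed, the renderer can hand Echogarden a sentence-level timeline and get back word-level alignments. A minimal sketch of the call, assuming the object exposed as `__ENJOY_APP__` is available as `EnjoyApp`; the audio blob, sentence entry, and options below are placeholders:

```typescript
// Minimal sketch of calling the new bridge from the renderer process.
// `EnjoyApp` stands in for the object exposed as window.__ENJOY_APP__;
// the blob, sentence entry, and options are placeholders.
async function alignSentences(EnjoyApp: any, audioBlob: Blob) {
  const sentenceTimeline = [
    {
      type: "sentence",
      text: "Hello world.",
      startTime: 0,
      endTime: 1.2,
      timeline: [],
    },
  ];

  // Resolves to a word-level Timeline aligned within each sentence segment.
  return EnjoyApp.echogarden.alignSegments(
    new Uint8Array(await audioBlob.arrayBuffer()),
    sentenceTimeline,
    { language: "en", isolate: false }
  );
}
```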

View File

@@ -12,4 +12,4 @@ export * from "./media-provider";
export * from "./media-tabs";
export * from "./media-loading-modal";
export * from "./add-media-button";
export * from "./media-transcription-download";
export * from "./media-transcription-print";

View File

@@ -246,7 +246,6 @@ export const MediaCaption = () => {
if (index < 0) return;
if (index !== activeIndex) {
console.log("setActiveIndex", index);
setActiveIndex(index);
}
}, [currentTime, caption]);
@@ -509,8 +508,8 @@ export const Caption = (props: {
let words = caption.text.split(" ");
const ipas = caption.timeline.map((w) =>
w.timeline.map((t) =>
language.startsWith("en")
w.timeline?.map((t) =>
t.timeline && language.startsWith("en")
? convertWordIpaToNormal(
t.timeline.map((s) => s.text),
{ mappings: ipaMappings }

View File

@@ -5,7 +5,7 @@ import {
} from "@renderer/context";
import { TabsContent, Separator } from "@renderer/components/ui";
import { t } from "i18next";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { convertWordIpaToNormal } from "@/utils";
import {
CamdictLookupResult,
@@ -41,7 +41,9 @@ const SelectedWords = (props: {
const { selectedIndices, caption } = props;
const { transcription } = useContext(MediaPlayerProviderContext);
const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);
const { learningLanguage, ipaMappings } = useContext(
AppSettingsProviderContext
);
const word = selectedIndices
.map((index) => caption.timeline[index]?.text || "")

View File

@@ -34,7 +34,7 @@ import { useHotkeys } from "react-hotkeys-hook";
import cloneDeep from "lodash/cloneDeep";
import debounce from "lodash/debounce";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
const PLAYBACK_RATE_OPTIONS = [0.75, 0.8, 0.9, 1.0];
export const MediaPlayerControls = () => {
@@ -57,7 +57,7 @@ export const MediaPlayerControls = () => {
setTranscriptionDraft,
} = useContext(MediaPlayerProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { currentHotkeys, enabled } = useContext(
const { currentHotkeys } = useContext(
HotKeysSettingsProviderContext
);
const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("single");

View File

@@ -76,7 +76,9 @@ export const MediaTranscriptionGenerateButton = (props: {
generateTranscription({
originalText: data.text,
language: data.language,
service: data.service as WhisperConfigType["service"],
service: data.service as
| WhisperConfigType["service"]
| "upload",
isolate: data.isolate,
})
.then(() => {

View File

@@ -9,7 +9,7 @@ import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { convertWordIpaToNormal } from "@/utils";
import template from "./transcription.template.html?raw";
export const MediaTranscriptionDownload = () => {
export const MediaTranscriptionPrint = () => {
const { media, transcription } = useContext(MediaPlayerProviderContext);
const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
AppSettingsProviderContext
@@ -59,7 +59,7 @@ export const MediaTranscriptionDownload = () => {
async function download() {
try {
const savePath = await EnjoyApp.dialog.showSaveDialog({
title: t("download"),
title: t("print"),
defaultPath: `${media.name}.pdf`,
});
@@ -75,7 +75,7 @@ export const MediaTranscriptionDownload = () => {
return (
<Button variant="ghost" className="block w-full" onClick={download}>
{t("download")}
{t("print")}
</Button>
);
};

View File

@@ -28,7 +28,7 @@ import {
SheetHeader,
toast,
} from "@renderer/components/ui";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { t } from "i18next";
import WaveSurfer from "wavesurfer.js";
import {

View File

@@ -26,7 +26,7 @@ import { formatDuration } from "@renderer/lib/utils";
import {
MediaTranscriptionReadButton,
MediaTranscriptionGenerateButton,
MediaTranscriptionDownload,
MediaTranscriptionPrint,
TranscriptionEditButton,
} from "@renderer/components";
@@ -165,7 +165,7 @@ export const MediaTranscription = (props: { display?: boolean }) => {
</TranscriptionEditButton>
</DropdownMenuItem>
<DropdownMenuItem asChild>
<MediaTranscriptionDownload />
<MediaTranscriptionPrint />
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>

View File

@@ -1,4 +1,4 @@
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { useContext, useState } from "react";
import { WavesurferPlayer } from "@/renderer/components/misc";
import { AppSettingsProviderContext } from "@/renderer/context";

View File

@@ -3,14 +3,11 @@ import {
AppSettingsProviderContext,
} from "@renderer/context";
import { zodResolver } from "@hookform/resolvers/zod";
import { useContext, useState } from "react";
import { useContext } from "react";
import { useForm } from "react-hook-form";
import { z } from "zod";
import {
Button,
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
Form,
FormDescription,
FormField,
@@ -31,8 +28,9 @@ import {
} from "@renderer/components/ui";
import { t } from "i18next";
import { LANGUAGES } from "@/constants";
import { ChevronDownIcon, ChevronUpIcon, LoaderIcon } from "lucide-react";
import { LoaderIcon } from "lucide-react";
import { parseText } from "media-captions";
import { milisecondsToTimestamp } from "@/utils";
const transcriptionSchema = z.object({
language: z.string(),
@@ -59,18 +57,28 @@ export const TranscriptionCreateForm = (props: {
} = props;
const { learningLanguage } = useContext(AppSettingsProviderContext);
const { whisperConfig } = useContext(AISettingsProviderContext);
const [collapsibleOpen, setCollapsibleOpen] = useState(false);
const form = useForm<z.infer<typeof transcriptionSchema>>({
resolver: zodResolver(transcriptionSchema),
values: {
language: learningLanguage,
service: whisperConfig.service,
service: originalText ? "upload" : whisperConfig.service,
text: originalText,
isolate: false,
},
});
const handleSubmit = (data: z.infer<typeof transcriptionSchema>) => {
const { service, text } = data;
if (service === "upload" && !text) {
toast.error(t("pleaseUploadTranscriptFile"));
return;
}
onSubmit(data);
};
const parseSubtitle = (file: File) => {
const fileType = file.name.split(".").pop();
return new Promise<string>((resolve, reject) => {
@@ -88,7 +96,16 @@ export const TranscriptionCreateForm = (props: {
if (caption.cues.length === 0) {
text = cleanSubtitleText(text as string);
} else {
text = caption.cues.map((cue) => cue.text).join("\n");
// Write cues to text in SRT format
text = caption.cues
.map((cue, _) => {
return `${milisecondsToTimestamp(
cue.startTime * 1000
)} --> ${milisecondsToTimestamp(cue.endTime * 1000)}\n${
cue.text
}`;
})
.join("\n\n");
}
if (text.length === 0) {
@@ -126,7 +143,7 @@ export const TranscriptionCreateForm = (props: {
return (
<Form {...form}>
<form
onSubmit={form.handleSubmit(onSubmit)}
onSubmit={form.handleSubmit(handleSubmit)}
className="gap-4 grid w-full"
>
<FormField
@@ -150,8 +167,21 @@ export const TranscriptionCreateForm = (props: {
{t("cloudflareAi")}
</SelectItem>
<SelectItem value="openai">OpenAI</SelectItem>
<SelectItem value="upload">{t("upload")}</SelectItem>
</SelectContent>
</Select>
<FormDescription>
{form.watch("service") === "local" &&
t("localSpeechToTextDescription")}
{form.watch("service") === "azure" &&
t("azureSpeechToTextDescription")}
{form.watch("service") === "cloudflare" &&
t("cloudflareSpeechToTextDescription")}
{form.watch("service") === "openai" &&
t("openaiSpeechToTextDescription")}
{form.watch("service") === "upload" &&
t("uploadSpeechToTextDescription")}
</FormDescription>
</FormItem>
)}
/>
@@ -181,16 +211,14 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
<Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
<CollapsibleContent className="space-y-4 mb-4">
{form.watch("service") === "upload" && (
<>
<FormField
control={form.control}
name="text"
render={({ field }) => (
<FormItem className="grid w-full items-center">
<FormLabel>
{t("uploadTranscriptFile")}({t("optinal")})
</FormLabel>
<FormLabel>{t("uploadTranscriptFile")}</FormLabel>
<Input
disabled={transcribing}
type="file"
@@ -245,25 +273,8 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
</CollapsibleContent>
<div className="flex justify-center">
<CollapsibleTrigger asChild>
<Button variant="ghost" size="sm">
{collapsibleOpen ? (
<>
<ChevronUpIcon className="h-4 w-4" />
<span className="ml-2">{t("lessOptions")}</span>
</>
) : (
<>
<ChevronDownIcon className="h-4 w-4" />
<span className="ml-2">{t("moreOptions")}</span>
</>
)}
</Button>
</CollapsibleTrigger>
</div>
</Collapsible>
</>
)}
<TranscribeProgress
service={form.watch("service")}
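
When the uploaded subtitle parses into cues, the form now re-serializes them as SRT blocks (timestamp, arrow, text) so the original sentence boundaries survive the alignment step. A self-contained sketch of the output for two cues — `toTimestamp` here is a stand-in for the project's `milisecondsToTimestamp` helper and assumes an `HH:MM:SS,mmm` format:

```typescript
// Self-contained sketch of the SRT-style text built from parsed cues.
// `toTimestamp` is a stand-in for milisecondsToTimestamp; cue values are invented.
const toTimestamp = (ms: number) => {
  const pad = (n: number, width = 2) =>
    Math.floor(n).toString().padStart(width, "0");
  return `${pad(ms / 3600000)}:${pad((ms % 3600000) / 60000)}:${pad(
    (ms % 60000) / 1000
  )},${pad(ms % 1000, 3)}`;
};

const cues = [
  { startTime: 0, endTime: 2.5, text: "Hello there." },
  { startTime: 2.5, endTime: 5.0, text: "How are you?" },
];

const text = cues
  .map(
    (cue) =>
      `${toTimestamp(cue.startTime * 1000)} --> ${toTimestamp(
        cue.endTime * 1000
      )}\n${cue.text}`
  )
  .join("\n\n");

// text:
// 00:00:00,000 --> 00:00:02,500
// Hello there.
//
// 00:00:02,500 --> 00:00:05,000
// How are you?
```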

View File

@@ -20,26 +20,36 @@ import {
Textarea,
toast,
} from "@renderer/components/ui";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { t } from "i18next";
import { useContext, useState } from "react";
import { useContext, useEffect, useState } from "react";
import { LoaderIcon } from "lucide-react";
import { milisecondsToTimestamp } from "@/utils";
export const TranscriptionEditButton = (props: {
children?: React.ReactNode;
}) => {
const [open, setOpen] = useState(false);
const [submiting, setSubmiting] = useState(false);
const { transcription, generateTranscription } = useContext(
const { media, transcription, generateTranscription } = useContext(
MediaPlayerProviderContext
);
const [open, setOpen] = useState(false);
const [submiting, setSubmiting] = useState(false);
const [content, setContent] = useState<string>(
transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
// generate text in SRT format from timeline entries
transcription.result.timeline
.map(
(t: TimelineEntry) =>
`${milisecondsToTimestamp(
t.startTime * 1000
)} --> ${milisecondsToTimestamp(t.endTime * 1000)}\n${t.text}`
)
.join("\n\n")
);
const [downloadUrl, setDownloadUrl] = useState<string>();
const handleSave = async () => {
setSubmiting(true);
generateTranscription({ originalText: content })
generateTranscription({ originalText: content, service: "upload" })
.then(() => setOpen(false))
.catch((e) => {
toast.error(e.message);
@@ -47,6 +57,13 @@ export const TranscriptionEditButton = (props: {
.finally(() => setSubmiting(false));
};
useEffect(() => {
if (!content) return;
const blob = new Blob([content], { type: "text/html" });
setDownloadUrl(URL.createObjectURL(blob));
}, [content]);
return (
<Dialog open={open} onOpenChange={setOpen}>
<DialogTrigger asChild>
@@ -76,6 +93,11 @@ export const TranscriptionEditButton = (props: {
{t("cancel")}
</Button>
</DialogClose>
<DialogClose asChild>
<a download={`${media.name}.srt`} href={downloadUrl}>
<Button variant="secondary">{t("download")}</Button>
</a>
</DialogClose>
<AlertDialog>
<AlertDialogTrigger asChild>

View File

@@ -68,7 +68,7 @@ type MediaPlayerContextType = {
generateTranscription: (params?: {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
service?: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}) => Promise<void>;
transcribing: boolean;
@@ -352,7 +352,7 @@ export const MediaPlayerProvider = ({
let phones: TimelineEntry[] = [];
words.forEach((word: TimelineEntry) => {
word.timeline.forEach((token: TimelineEntry) => {
word.timeline?.forEach((token: TimelineEntry) => {
phones = phones.concat(token.timeline);
});
});

View File

@@ -8,9 +8,92 @@ import { t } from "i18next";
import { AI_WORKER_ENDPOINT } from "@/constants";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import axios from "axios";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { useAiCommand } from "./use-ai-command";
import { toast } from "@renderer/components/ui";
import {
Timeline,
TimelineEntry,
type TimelineEntryType,
} from "echogarden/dist/utilities/Timeline";
import take from "lodash/take";
import sortedUniqBy from "lodash/sortedUniqBy";
import { parseText } from "media-captions";
/*
* define the regex pattern to match the end of a sentence
* the end of a sentence is defined as a period, question mark, or exclamation mark
* also it may be followed by a quotation mark
* and exclude special cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc."
*/
const sentenceEndPattern = /(?<!Mr|Mrs|Dr|Ms|etc)\.|\?|!\"?/;
// test whether a text string contains any punctuation;
// some transcribed text may not have any punctuation
const punctuationsPattern = /\w[.,!?](\s|$)/g;
/*
* convert the word timeline to sentence timeline
* a sentence is a group of words that ends with a punctuation
*/
const wordTimelineToSentenceTimeline = (
wordTimeline: TimelineEntry[]
): TimelineEntry[] => {
const timeline: TimelineEntry[] = [];
wordTimeline.forEach((word, index) => {
word.text = word.text.trim();
// skip empty words
if (!word.text) return;
// skip music or sound effects quoted in []
if (word.text.match(/^\[.*\]$/)) return;
const wordEntry = {
type: "word" as TimelineEntryType,
text: word.text,
startTime: word.startTime,
endTime: word.endTime,
};
let sentence: TimelineEntry;
// get the last sentence in the timeline
if (timeline.length > 0) {
sentence = timeline[timeline.length - 1];
}
// if there is no sentence in the timeline, create a new sentence
// if last sentence is a punctuation, create a new sentence
if (!sentence || sentence.text.match(sentenceEndPattern)) {
sentence = {
type: "sentence" as TimelineEntryType,
text: "",
startTime: wordEntry.startTime,
endTime: wordEntry.endTime,
timeline: [],
};
timeline.push(sentence);
}
// if the word is a punctuation, add it to the sentence and start a new sentence
if (wordEntry.text.match(sentenceEndPattern)) {
sentence.text += wordEntry.text;
sentence.endTime = wordEntry.endTime;
const lastSentence = timeline[timeline.length - 1];
if (lastSentence.endTime !== sentence.endTime) {
timeline.push(sentence);
}
} else {
sentence.text += wordEntry.text + " ";
sentence.endTime = wordEntry.endTime;
if (index === wordTimeline.length - 1) {
timeline.push(sentence);
}
}
});
return timeline;
};
export const useTranscribe = () => {
const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
@@ -37,13 +120,14 @@ export const useTranscribe = () => {
targetType?: string;
originalText?: string;
language: string;
service: WhisperConfigType["service"];
service: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}
): Promise<{
engine: string;
model: string;
alignmentResult: AlignmentResult;
transcript: string;
timeline: TimelineEntry[];
originalText?: string;
tokenId?: number;
}> => {
@@ -58,67 +142,152 @@ export const useTranscribe = () => {
} = params || {};
const blob = await (await fetch(url)).blob();
let result;
if (originalText) {
result = {
engine: "original",
model: "original",
};
let result: any;
let timeline: Timeline = [];
if (service === "upload" && originalText) {
const caption = await parseText(originalText, { type: "srt" });
if (caption.cues.length > 0) {
timeline = caption.cues.map((cue) => {
return {
type: "sentence",
text: cue.text,
startTime: cue.startTime,
endTime: cue.endTime,
timeline: [],
};
});
result = {
engine: "upload",
model: "-",
text: timeline.map((entry) => entry.text).join(" "),
timeline,
};
} else {
result = {
engine: "upload",
model: "-",
text: originalText,
};
}
} else if (service === "local") {
result = await transcribeByLocal(url, language);
} else if (service === "cloudflare") {
result = await transcribeByCloudflareAi(blob);
} else if (service === "openai") {
result = await transcribeByOpenAi(blob);
result = await transcribeByOpenAi(
new File([blob], "audio.mp3", { type: "audio/mp3" })
);
} else if (service === "azure") {
result = await transcribeByAzureAi(blob, language, {
targetId,
targetType,
});
result = await transcribeByAzureAi(
new File([blob], "audio.wav", { type: "audio/wav" }),
language,
{
targetId,
targetType,
}
);
} else {
throw new Error(t("whisperServiceNotSupported"));
}
let transcript = result.text;
setOutput(null);
/*
* if timeline is available and the transcript contains punctuations
* use `alignSegments` to align each sentence with the timeline
* otherwise, use `align` to align the whole transcript
* if the transcript does not contain any punctuation, use AI command to add punctuation
*/
if (result.timeline?.length && transcript.match(punctuationsPattern)) {
timeline = [...result.timeline];
setOutput("Aligning the transcript...");
const wordTimeline = await EnjoyApp.echogarden.alignSegments(
new Uint8Array(await blob.arrayBuffer()),
timeline,
{
language,
isolate,
}
);
let transcript = originalText || result.text;
wordTimeline.forEach((word: TimelineEntry) => {
let sentence = timeline.find(
(entry) =>
word.startTime >= entry.startTime && word.endTime <= entry.endTime
);
// Remove all content inside `()`, `[]`, `{}` and trim the text
// remove all markdown formatting
transcript = transcript
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.replace(/\{.*?\}/g, "")
.replace(/[*_`]/g, "")
.trim();
if (sentence) {
sentence.timeline.push(word);
}
});
// if the transcript does not contain any punctuation, use AI command to add punctuation
if (!transcript.match(/\w[.,!?](\s|$)/)) {
try {
transcript = await punctuateText(transcript);
} catch (err) {
toast.error(err.message);
console.warn(err.message);
/*
* the start time of a sentence should be the start time of the first word in the sentence
* the end time of a sentence should be the end time of the last word in the sentence
*/
// timeline.forEach((t) => {
// if (t.timeline.length === 0) return;
// t.startTime = t.timeline[0].startTime;
// t.endTime = t.timeline[t.timeline.length - 1].endTime;
// });
} else {
// Remove all content inside `()`, `[]`, `{}` and trim the text
// remove all markdown formatting
transcript = transcript
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.replace(/\{.*?\}/g, "")
.replace(/[*_`]/g, "")
.trim();
// if the transcript does not contain any punctuation, use AI command to add punctuation
if (!transcript.match(punctuationsPattern)) {
try {
transcript = await punctuateText(transcript);
} catch (err) {
toast.error(err.message);
console.warn(err.message);
}
}
setOutput("Aligning the transcript...");
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
transcript,
{
language,
isolate,
}
);
alignmentResult.timeline.forEach((t: TimelineEntry) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
}
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
transcript,
{
language,
isolate,
}
);
return {
...result,
originalText,
alignmentResult,
transcript,
timeline,
};
};
const transcribeByLocal = async (url: string, language?: string) => {
const transcribeByLocal = async (
url: string,
language?: string
): Promise<{
engine: string;
model: string;
text: string;
timeline: TimelineEntry[];
}> => {
const res = await EnjoyApp.whisper.transcribe(
{
file: url,
@@ -130,14 +299,25 @@ export const useTranscribe = () => {
}
);
const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.text,
startTime: word.offsets.from / 1000.0,
endTime: word.offsets.to / 1000.0,
};
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
return {
engine: "whisper",
model: res.model.type,
text: res.transcription.map((segment) => segment.text).join(" "),
timeline,
};
};
const transcribeByOpenAi = async (blob: Blob) => {
const transcribeByOpenAi = async (file: File) => {
if (!openai?.key) {
throw new Error(t("openaiKeyRequired"));
}
@@ -149,20 +329,58 @@ export const useTranscribe = () => {
maxRetries: 0,
});
const res: { text: string } = (await client.audio.transcriptions.create({
file: new File([blob], "audio.wav"),
const res: {
text: string;
words?: { word: string; start: number; end: number }[];
segments?: { text: string; start: number; end: number }[];
} = (await client.audio.transcriptions.create({
file,
model: "whisper-1",
response_format: "json",
response_format: "verbose_json",
timestamp_granularities: ["word"],
})) as any;
let timeline: TimelineEntry[] = [];
if (res.segments) {
res.segments.forEach((segment) => {
const segmentTimeline = {
type: "sentence" as TimelineEntryType,
text: segment.text,
startTime: segment.start,
endTime: segment.end,
timeline: [] as Timeline,
};
timeline.push(segmentTimeline);
});
} else if (res.words) {
const wordTimeline = res.words.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.word,
startTime: word.start,
endTime: word.end,
};
});
timeline = wordTimelineToSentenceTimeline(wordTimeline);
}
return {
engine: "openai",
model: "whisper-1",
text: res.text,
timeline,
};
};
const transcribeByCloudflareAi = async (blob: Blob) => {
const transcribeByCloudflareAi = async (
blob: Blob
): Promise<{
engine: string;
model: string;
text: string;
timeline?: TimelineEntry[];
}> => {
const res: CfWhipserOutputType = (
await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
headers: {
@@ -172,15 +390,26 @@ export const useTranscribe = () => {
})
).data;
const wordTimeline = res.words.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.word,
startTime: word.start,
endTime: word.end,
};
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
return {
engine: "cloudflare",
model: "@cf/openai/whisper",
text: res.text,
timeline,
};
};
const transcribeByAzureAi = async (
blob: Blob,
file: File,
language: string,
params?: {
targetId?: string;
@@ -191,12 +420,11 @@ export const useTranscribe = () => {
model: string;
text: string;
tokenId: number;
timeline?: TimelineEntry[];
}> => {
const { id, token, region } = await webApi.generateSpeechToken(params);
const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
const audioConfig = sdk.AudioConfig.fromWavFileInput(
new File([blob], "audio.wav")
);
const audioConfig = sdk.AudioConfig.fromWavFileInput(file);
// setting the recognition language to learning language, such as 'en-US'.
config.speechRecognitionLanguage = language;
config.requestWordLevelTimestamps();
@@ -209,7 +437,6 @@ export const useTranscribe = () => {
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
console.log(e.result);
setOutput(e.result.text);
};
@@ -232,10 +459,40 @@ export const useTranscribe = () => {
reco.sessionStopped = (_s, _e) => {
reco.stopContinuousRecognitionAsync();
const wordTimeline: TimelineEntry[] = [];
results.forEach((result) => {
const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
const splitedWords = best.Display.trim().split(" ");
best.Words.forEach((word, index) => {
let text = word.Word;
if (splitedWords.length === best.Words.length) {
text = splitedWords[index];
}
if (
index === best.Words.length - 1 &&
!text.trim().match(sentenceEndPattern)
) {
text = text + ".";
}
wordTimeline.push({
type: "word" as TimelineEntryType,
text,
startTime: word.Offset / 10000000.0,
endTime: (word.Offset + word.Duration) / 10000000.0,
});
});
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
resolve({
engine: "azure",
model: "whisper",
text: results.map((result) => result.DisplayText).join(" "),
timeline,
tokenId: id,
});
};
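
Most of the new transcription paths funnel into `wordTimelineToSentenceTimeline`, which groups word entries into sentences that end at `.`, `?`, or `!` and drops bracketed sound effects. An illustrative input/output pair (the words are invented; the grouping follows the rules defined at the top of this hook):

```typescript
// Illustrative input/output for wordTimelineToSentenceTimeline (words invented).
const words = [
  { type: "word", text: "[music]", startTime: 0.0, endTime: 1.0 },
  { type: "word", text: "Hello", startTime: 1.0, endTime: 1.4 },
  { type: "word", text: "there.", startTime: 1.4, endTime: 1.9 },
  { type: "word", text: "How", startTime: 2.1, endTime: 2.4 },
  { type: "word", text: "are", startTime: 2.4, endTime: 2.6 },
  { type: "word", text: "you?", startTime: 2.6, endTime: 3.0 },
];

// Expected grouping:
// [
//   { type: "sentence", text: "Hello there.", startTime: 1.0, endTime: 1.9, timeline: [] },
//   { type: "sentence", text: "How are you?", startTime: 2.1, endTime: 3.0, timeline: [] },
// ]
// "[music]" is skipped, each sentence closes at the word matching sentenceEndPattern,
// and the per-sentence word timeline is filled in afterwards by alignSegments.
```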

View File

@@ -20,9 +20,9 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
const [transcribing, setTranscribing] = useState<boolean>(false);
const [transcribingOutput, setTranscribingOutput] = useState<string>("");
const [service, setService] = useState<WhisperConfigType["service"]>(
whisperConfig.service
);
const [service, setService] = useState<
WhisperConfigType["service"] | "upload"
>(whisperConfig.service);
const onTransactionUpdate = (event: CustomEvent) => {
if (!transcription) return;
@@ -63,7 +63,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
const generateTranscription = async (params?: {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
service?: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}) => {
let {
@@ -87,7 +87,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}
}
}
const { engine, model, alignmentResult, tokenId } = await transcribe(
const { engine, model, transcript, timeline, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
@@ -99,18 +99,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
timeline = preProcessTranscription(timeline);
const processedTimeline = preProcessTranscription(timeline);
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
@@ -126,8 +115,8 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
timeline: processedTimeline,
transcript,
originalText,
tokenId,
},
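
After this refactor, `transcribe()` hands back the sentence timeline and transcript directly, so the hook no longer flattens an `AlignmentResult` itself. The persisted shape looks roughly like this (field names from the destructuring above; values are illustrative):

```typescript
// Approximate shape returned by transcribe() and stored in transcription.result.
const result = {
  engine: "whisper",
  model: "base",
  transcript: "Hello there. How are you?",
  timeline: [
    {
      type: "sentence",
      text: "Hello there.",
      startTime: 1.0,
      endTime: 1.9,
      timeline: [], // word-level entries filled in by alignSegments
    },
  ],
  tokenId: undefined,
};
```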

View File

@@ -252,6 +252,11 @@ type EnjoyAppType = {
transcript: string,
options?: any
) => Promise<AlignmentResult>;
alignSegments: (
input: string | Uint8Array,
timeline: Timeline,
options?: any
) => Promise<Timeline>;
transcode: (input: string) => Promise<string>;
check: () => Promise<boolean>;
};

View File

@@ -49,7 +49,7 @@ export function milisecondsToTimestamp(ms: number) {
const hours = Math.floor(ms / 3600000).toString();
const minutes = Math.floor((ms % 3600000) / 60000).toString();
const seconds = Math.floor(((ms % 360000) % 60000) / 1000).toString();
const milliseconds = Math.floor(((ms % 360000) % 60000) % 1000).toString();
const milliseconds = Math.round(((ms % 360000) % 60000) % 1000).toString();
return `${hours.padStart(2, "0")}:${minutes.padStart(
2,
"0"