diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
index 3a2af169..efccecee 100644
--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "Download FFmpeg",
   "youAreReadyToGo": "You are ready to go",
   "welcomeBack": "Welcome back! {{name}}",
+  "print": "Print",
   "download": "Download",
   "downloading": "Downloading {{file}}",
   "downloadedSuccessfully": "Downloaded successfully",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
   "openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
+  "uploadSpeechToTextDescription": "Upload a transcript file or enter transcript text to align.",
   "checkingWhisper": "Checking whisper status",
   "pleaseDownloadWhisperModelFirst": "Please download whisper model first",
   "whisperIsWorkingGood": "Whisper is working good",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "Assessed successfully",
   "optinal": "Optional",
   "uploadTranscriptFile": "Upload transcript file",
-  "uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
+  "uploadTranscriptFileDescription": "Supported formats: txt/srt/vtt.",
+  "pleaseUploadTranscriptFile": "Please upload a transcript file",
   "onlyTextFileIsSupported": "Only text file is supported",
   "isolateVoice": "Isolate voice(Experimental)",
   "isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
index a8220b4f..fd98891d 100644
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "下载 FFmpeg",
   "youAreReadyToGo": "您已准备就绪",
   "welcomeBack": "欢迎回来, {{name}}",
+  "print": "打印",
   "download": "下载",
   "downloading": "正在下载 {{file}}",
   "downloadedSuccessfully": "下载成功",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
   "openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
+  "uploadSpeechToTextDescription": "上传字幕文件或者输入文本进行字幕对齐",
   "checkingWhisper": "正在检查 Whisper",
   "pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
   "whisperIsWorkingGood": "Whisper 正常工作",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "评估成功",
   "optinal": "可选",
   "uploadTranscriptFile": "上传字幕文件",
-  "uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
+  "uploadTranscriptFileDescription": "支持字幕文件格式: txt/srt/vtt。",
+  "pleaseUploadTranscriptFile": "请上传字幕文件",
   "onlyTextFileIsSupported": "仅支持文本文件",
   "isolateVoice": "提取人声(实验性)",
   "isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
diff --git a/enjoy/src/main/echogarden.ts b/enjoy/src/main/echogarden.ts
index ae8de742..217ac5f4 100644
--- a/enjoy/src/main/echogarden.ts
+++ b/enjoy/src/main/echogarden.ts
@@ -1,7 +1,6 @@
 import { ipcMain } from "electron";
 import * as Echogarden from "echogarden/dist/api/API.js";
 import { AlignmentOptions } from "echogarden/dist/api/API";
-import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
 import {
   encodeRawAudioToWave,
   decodeWaveToRawAudio,
@@ -9,7 +8,9 @@ import {
   getRawAudioDuration,
   trimAudioStart,
   trimAudioEnd,
+  AudioSourceParam,
 } from "echogarden/dist/audio/AudioUtilities.js";
+import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -34,6 +35,7 @@ const __dirname = path

 const logger = log.scope("echogarden");

 class EchogardenWrapper {
   public align: typeof Echogarden.align;
+  public alignSegments: typeof Echogarden.alignSegments;
   public denoise: typeof Echogarden.denoise;
   public encodeRawAudioToWave: typeof encodeRawAudioToWave;
   public decodeWaveToRawAudio: typeof decodeWaveToRawAudio;
@@ -44,6 +46,7 @@ class EchogardenWrapper {

   constructor() {
     this.align = Echogarden.align;
+    this.alignSegments = Echogarden.alignSegments;
     this.denoise = Echogarden.denoise;
     this.encodeRawAudioToWave = encodeRawAudioToWave;
     this.decodeWaveToRawAudio = decodeWaveToRawAudio;
@@ -110,6 +113,25 @@ class EchogardenWrapper {
       }
     );

+    ipcMain.handle(
+      "echogarden-align-segments",
+      async (
+        _event,
+        input: AudioSourceParam,
+        timeline: Timeline,
+        options: AlignmentOptions
+      ) => {
+        logger.debug("echogarden-align-segments:", timeline, options);
+        try {
+          const rawAudio = await this.ensureRawAudio(input, 16000);
+          return await this.alignSegments(rawAudio, timeline, options);
+        } catch (err) {
+          logger.error(err);
+          throw err;
+        }
+      }
+    );
+
     ipcMain.handle(
       "echogarden-transcode",
       async (_event, url: string, sampleRate?: number) => {
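Note on the new IPC channel: the main process decodes whatever audio source it receives down to 16 kHz raw audio before delegating to Echogarden's alignSegments. A minimal renderer-side sketch of a call through the bridge (the bridge name matches the preload.ts change further down; the timeline shape and the options object are assumptions based on the handler's signature, not a confirmed API):

// Sketch: align known sentence boundaries against an audio source.
// Assumes the __ENJOY_APP__ bridge exposed in preload.ts below.
async function alignKnownSentences(src: string) {
  const blob = await (await fetch(src)).blob();

  const sentenceTimeline = [
    {
      type: "sentence",
      text: "Ask not what your country can do for you.",
      startTime: 0.0,
      endTime: 3.2,
      timeline: [], // alignSegments fills in word-level entries
    },
  ];

  // Returns a word-level timeline (see use-transcribe.tsx below).
  return (window as any).__ENJOY_APP__.echogarden.alignSegments(
    new Uint8Array(await blob.arrayBuffer()),
    sentenceTimeline,
    { language: "en" }
  );
}
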
"en" : language?.split("-")?.[0] || "auto", + `--split-on-word`, + `--max-len`, + "1", ...extra, ]; diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts index d9b6900a..2d1789d5 100644 --- a/enjoy/src/preload.ts +++ b/enjoy/src/preload.ts @@ -2,6 +2,7 @@ // https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron"; import { version } from "../package.json"; +import { Timeline } from "echogarden/dist/utilities/Timeline"; contextBridge.exposeInMainWorld("__ENJOY_APP__", { app: { @@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", { align: (input: string, transcript: string, options: any) => { return ipcRenderer.invoke("echogarden-align", input, transcript, options); }, + alignSegments: (input: string, timeline: Timeline, options: any) => { + return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options); + }, transcode: (input: string) => { return ipcRenderer.invoke("echogarden-transcode", input); }, diff --git a/enjoy/src/renderer/components/medias/index.ts b/enjoy/src/renderer/components/medias/index.ts index a99b6b46..f57e0d14 100644 --- a/enjoy/src/renderer/components/medias/index.ts +++ b/enjoy/src/renderer/components/medias/index.ts @@ -12,4 +12,4 @@ export * from "./media-provider"; export * from "./media-tabs"; export * from "./media-loading-modal"; export * from "./add-media-button"; -export * from "./media-transcription-download"; +export * from "./media-transcription-print"; diff --git a/enjoy/src/renderer/components/medias/media-caption.tsx b/enjoy/src/renderer/components/medias/media-caption.tsx index eb4a8d27..e5b3f60b 100644 --- a/enjoy/src/renderer/components/medias/media-caption.tsx +++ b/enjoy/src/renderer/components/medias/media-caption.tsx @@ -246,7 +246,6 @@ export const MediaCaption = () => { if (index < 0) return; if (index !== activeIndex) { - console.log("setActiveIndex", index); setActiveIndex(index); } }, [currentTime, caption]); @@ -509,8 +508,8 @@ export const Caption = (props: { let words = caption.text.split(" "); const ipas = caption.timeline.map((w) => - w.timeline.map((t) => - language.startsWith("en") + w.timeline?.map((t) => + t.timeline && language.startsWith("en") ? 
diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts
index d9b6900a..2d1789d5 100644
--- a/enjoy/src/preload.ts
+++ b/enjoy/src/preload.ts
@@ -2,6 +2,7 @@
 // https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
 import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
 import { version } from "../package.json";
+import { Timeline } from "echogarden/dist/utilities/Timeline";

 contextBridge.exposeInMainWorld("__ENJOY_APP__", {
   app: {
@@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     align: (input: string, transcript: string, options: any) => {
       return ipcRenderer.invoke("echogarden-align", input, transcript, options);
     },
+    alignSegments: (input: string, timeline: Timeline, options: any) => {
+      return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
+    },
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
     },
diff --git a/enjoy/src/renderer/components/medias/index.ts b/enjoy/src/renderer/components/medias/index.ts
index a99b6b46..f57e0d14 100644
--- a/enjoy/src/renderer/components/medias/index.ts
+++ b/enjoy/src/renderer/components/medias/index.ts
@@ -12,4 +12,4 @@ export * from "./media-provider";
 export * from "./media-tabs";
 export * from "./media-loading-modal";
 export * from "./add-media-button";
-export * from "./media-transcription-download";
+export * from "./media-transcription-print";
diff --git a/enjoy/src/renderer/components/medias/media-caption.tsx b/enjoy/src/renderer/components/medias/media-caption.tsx
index eb4a8d27..e5b3f60b 100644
--- a/enjoy/src/renderer/components/medias/media-caption.tsx
+++ b/enjoy/src/renderer/components/medias/media-caption.tsx
@@ -246,7 +246,6 @@ export const MediaCaption = () => {
     if (index < 0) return;

     if (index !== activeIndex) {
-      console.log("setActiveIndex", index);
       setActiveIndex(index);
     }
   }, [currentTime, caption]);
@@ -509,8 +508,8 @@ export const Caption = (props: {
   let words = caption.text.split(" ");

   const ipas = caption.timeline.map((w) =>
-    w.timeline.map((t) =>
-      language.startsWith("en")
+    w.timeline?.map((t) =>
+      t.timeline && language.startsWith("en")
         ? convertWordIpaToNormal(
             t.timeline.map((s) => s.text),
             { mappings: ipaMappings }
diff --git a/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx b/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
index a0a90f8d..0582ea07 100644
--- a/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
+++ b/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
@@ -5,7 +5,7 @@ import {
 } from "@renderer/context";
 import { TabsContent, Separator } from "@renderer/components/ui";
 import { t } from "i18next";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import {
   CamdictLookupResult,
@@ -41,7 +41,9 @@ const SelectedWords = (props: {
   const { selectedIndices, caption } = props;
   const { transcription } = useContext(MediaPlayerProviderContext);

-  const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);
+  const { learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );

   const word = selectedIndices
     .map((index) => caption.timeline[index]?.text || "")
diff --git a/enjoy/src/renderer/components/medias/media-player-controls.tsx b/enjoy/src/renderer/components/medias/media-player-controls.tsx
index d778f2c2..b387bd2c 100644
--- a/enjoy/src/renderer/components/medias/media-player-controls.tsx
+++ b/enjoy/src/renderer/components/medias/media-player-controls.tsx
@@ -34,7 +34,7 @@ import { useHotkeys } from "react-hotkeys-hook";
 import cloneDeep from "lodash/cloneDeep";
 import debounce from "lodash/debounce";
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";

 const PLAYBACK_RATE_OPTIONS = [0.75, 0.8, 0.9, 1.0];

 export const MediaPlayerControls = () => {
@@ -57,7 +57,7 @@ export const MediaPlayerControls = () => {
     setTranscriptionDraft,
   } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
-  const { currentHotkeys, enabled } = useContext(
+  const { currentHotkeys } = useContext(
     HotKeysSettingsProviderContext
   );
   const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("single");
diff --git a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
index 6d64262e..e17efd4d 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
@@ -76,7 +76,9 @@ export const MediaTranscriptionGenerateButton = (props: {
     generateTranscription({
       originalText: data.text,
       language: data.language,
-      service: data.service as WhisperConfigType["service"],
+      service: data.service as
+        | WhisperConfigType["service"]
+        | "upload",
       isolate: data.isolate,
     })
       .then(() => {
diff --git a/enjoy/src/renderer/components/medias/media-transcription-download.tsx b/enjoy/src/renderer/components/medias/media-transcription-print.tsx
similarity index 95%
rename from enjoy/src/renderer/components/medias/media-transcription-download.tsx
rename to enjoy/src/renderer/components/medias/media-transcription-print.tsx
index 970ab998..ba959033 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-download.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-print.tsx
@@ -9,7 +9,7 @@
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import template from "./transcription.template.html?raw";

-export const MediaTranscriptionDownload = () => {
+export const MediaTranscriptionPrint = () => {
   const { media, transcription } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
     AppSettingsProviderContext
   );
@@ -59,7 +59,7 @@
   async function download() {
     try {
       const savePath = await EnjoyApp.dialog.showSaveDialog({
-        title: t("download"),
+        title: t("print"),
         defaultPath: `${media.name}.pdf`,
       });
@@ -75,7 +75,7 @@
   return (
   );
 };
diff --git a/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
index 92635c85..0807a702 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
@@ -28,7 +28,7 @@ import {
   SheetHeader,
   toast,
 } from "@renderer/components/ui";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { t } from "i18next";
 import WaveSurfer from "wavesurfer.js";
 import {
diff --git a/enjoy/src/renderer/components/medias/media-transcription.tsx b/enjoy/src/renderer/components/medias/media-transcription.tsx
index 3076420a..fa10dc5b 100644
--- a/enjoy/src/renderer/components/medias/media-transcription.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -26,7 +26,7 @@ import { formatDuration } from "@renderer/lib/utils";
 import {
   MediaTranscriptionReadButton,
   MediaTranscriptionGenerateButton,
-  MediaTranscriptionDownload,
+  MediaTranscriptionPrint,
   TranscriptionEditButton,
 } from "@renderer/components";

@@ -165,7 +165,7 @@ export const MediaTranscription = (props: { display?: boolean }) => {
-        <MediaTranscriptionDownload />
+        <MediaTranscriptionPrint />
diff --git a/enjoy/src/renderer/components/notes/note-segment.tsx b/enjoy/src/renderer/components/notes/note-segment.tsx
index 266bd266..f2f1eb9d 100644
--- a/enjoy/src/renderer/components/notes/note-segment.tsx
+++ b/enjoy/src/renderer/components/notes/note-segment.tsx
@@ -1,4 +1,4 @@
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { useContext, useState } from "react";
 import { WavesurferPlayer } from "@/renderer/components/misc";
 import { AppSettingsProviderContext } from "@/renderer/context";
diff --git a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
index 80ff3b90..5beb3df3 100644
--- a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
+++ b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
@@ -3,14 +3,11 @@ import {
   AppSettingsProviderContext,
 } from "@renderer/context";
 import { zodResolver } from "@hookform/resolvers/zod";
-import { useContext, useState } from "react";
+import { useContext } from "react";
 import { useForm } from "react-hook-form";
 import { z } from "zod";
 import {
   Button,
-  Collapsible,
-  CollapsibleContent,
-  CollapsibleTrigger,
   Form,
   FormDescription,
   FormField,
@@ -31,8 +28,9 @@ import {
 } from "@renderer/components/ui";
 import { t } from "i18next";
 import { LANGUAGES } from "@/constants";
-import { ChevronDownIcon, ChevronUpIcon, LoaderIcon } from "lucide-react";
+import { LoaderIcon } from "lucide-react";
 import { parseText } from "media-captions";
+import { milisecondsToTimestamp } from "@/utils";

 const transcriptionSchema = z.object({
   language: z.string(),
@@ -59,18 +57,28 @@ export const TranscriptionCreateForm = (props: {
   } = props;
   const { learningLanguage } = useContext(AppSettingsProviderContext);
   const { whisperConfig } = useContext(AISettingsProviderContext);
-  const [collapsibleOpen, setCollapsibleOpen] = useState(false);

   const form = useForm<z.infer<typeof transcriptionSchema>>({
     resolver: zodResolver(transcriptionSchema),
     values: {
       language: learningLanguage,
-      service: whisperConfig.service,
+      service: originalText ? "upload" : whisperConfig.service,
       text: originalText,
       isolate: false,
     },
   });

+  const handleSubmit = (data: z.infer<typeof transcriptionSchema>) => {
+    const { service, text } = data;
+
+    if (service === "upload" && !text) {
+      toast.error(t("pleaseUploadTranscriptFile"));
+      return;
+    }
+
+    onSubmit(data);
+  };
+
   const parseSubtitle = (file: File) => {
     const fileType = file.name.split(".").pop();
     return new Promise((resolve, reject) => {
@@ -88,7 +96,16 @@ export const TranscriptionCreateForm = (props: {
       if (caption.cues.length === 0) {
         text = cleanSubtitleText(text as string);
       } else {
-        text = caption.cues.map((cue) => cue.text).join("\n");
+        // Write cues to text in SRT format
+        text = caption.cues
+          .map((cue, _) => {
+            return `${milisecondsToTimestamp(
+              cue.startTime * 1000
+            )} --> ${milisecondsToTimestamp(cue.endTime * 1000)}\n${
+              cue.text
+            }`;
+          })
+          .join("\n\n");
       }

       if (text.length === 0) {
@@ -126,7 +143,7 @@ export const TranscriptionCreateForm = (props: {
   return (
                   OpenAI
+                  {t("upload")}
+
+              {form.watch("service") === "local" &&
+                t("localSpeechToTextDescription")}
+              {form.watch("service") === "azure" &&
+                t("azureSpeechToTextDescription")}
+              {form.watch("service") === "cloudflare" &&
+                t("cloudflareSpeechToTextDescription")}
+              {form.watch("service") === "openai" &&
+                t("openaiSpeechToTextDescription")}
+              {form.watch("service") === "upload" &&
+                t("uploadSpeechToTextDescription")}
           )}
         />
@@ -181,16 +211,14 @@ export const TranscriptionCreateForm = (props: {
           )}
         />
-
-
+        {form.watch("service") === "upload" && (
+          <>
             (
-
-              {t("uploadTranscriptFile")}({t("optinal")})
-
+              {t("uploadTranscriptFile")}
           )}
         />
-
-
-
-
-
-
-
+
+        )}
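For reference, the cue-to-SRT serialization above produces blocks like the following (a worked example; milisecondsToTimestamp is the helper patched in utils.ts at the end of this diff, assumed here to emit SRT-style HH:MM:SS,mmm timestamps):

import { milisecondsToTimestamp } from "@/utils";

const cues = [
  { startTime: 1.5, endTime: 3.25, text: "Hello there." },
  { startTime: 3.5, endTime: 5.25, text: "General Kenobi." },
];

const text = cues
  .map(
    (cue) =>
      `${milisecondsToTimestamp(cue.startTime * 1000)} --> ${milisecondsToTimestamp(
        cue.endTime * 1000
      )}\n${cue.text}`
  )
  .join("\n\n");

// text:
// 00:00:01,500 --> 00:00:03,250
// Hello there.
//
// 00:00:03,500 --> 00:00:05,250
// General Kenobi.
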
diff --git a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx
@@ ... @@ export const TranscriptionEditButton = (props: {
-  const [open, setOpen] = useState(false);
-  const [submiting, setSubmiting] = useState(false);
-  const { transcription, generateTranscription } = useContext(
+  const { media, transcription, generateTranscription } = useContext(
     MediaPlayerProviderContext
   );
+  const [open, setOpen] = useState(false);
+  const [submiting, setSubmiting] = useState(false);
   const [content, setContent] = useState(
-    transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
+    // generate text in SRT format from timeline entries
+    transcription.result.timeline
+      .map(
+        (t: TimelineEntry) =>
+          `${milisecondsToTimestamp(
+            t.startTime * 1000
+          )} --> ${milisecondsToTimestamp(t.endTime * 1000)}\n${t.text}`
+      )
+      .join("\n\n")
   );
+  const [downloadUrl, setDownloadUrl] = useState<string>();

   const handleSave = async () => {
     setSubmiting(true);
-    generateTranscription({ originalText: content })
+    generateTranscription({ originalText: content, service: "upload" })
       .then(() => setOpen(false))
       .catch((e) => {
         toast.error(e.message);
@@ -47,6 +57,13 @@ export const TranscriptionEditButton = (props: {
       .finally(() => setSubmiting(false));
   };

+  useEffect(() => {
+    if (!content) return;
+
+    const blob = new Blob([content], { type: "text/html" });
+    setDownloadUrl(URL.createObjectURL(blob));
+  }, [content]);
+
   return (
@@ -76,6 +93,11 @@ export const TranscriptionEditButton = (props: {
               {t("cancel")}
+
+
+
+
+
diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx
index ce2fe5e9..12c7656a 100644
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -68,7 +68,7 @@ type MediaPlayerContextType = {
   generateTranscription: (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => Promise<void>;
   transcribing: boolean;
@@ -352,7 +352,7 @@ export const MediaPlayerProvider = ({
     let phones: TimelineEntry[] = [];
     words.forEach((word: TimelineEntry) => {
-      word.timeline.forEach((token: TimelineEntry) => {
+      word.timeline?.forEach((token: TimelineEntry) => {
         phones = phones.concat(token.timeline);
       });
     });
diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx
index 89c0f82e..d2db50cf 100644
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -8,9 +8,92 @@
 import { t } from "i18next";
 import { AI_WORKER_ENDPOINT } from "@/constants";
 import * as sdk from "microsoft-cognitiveservices-speech-sdk";
 import axios from "axios";
-import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { useAiCommand } from "./use-ai-command";
 import { toast } from "@renderer/components/ui";
+import {
+  Timeline,
+  TimelineEntry,
+  type TimelineEntryType,
+} from "echogarden/dist/utilities/Timeline";
+import take from "lodash/take";
+import sortedUniqBy from "lodash/sortedUniqBy";
+import { parseText } from "media-captions";
+
+/*
+ * Define the regex pattern that matches the end of a sentence:
+ * a period, question mark, or exclamation mark, optionally followed
+ * by a quotation mark, excluding special cases like "Mr.", "Mrs.",
+ * "Dr.", "Ms.", "etc."
+ */
+const sentenceEndPattern = /(?
+
+const wordTimelineToSentenceTimeline = (wordTimeline: TimelineEntry[]) => {
+  const timeline: TimelineEntry[] = [];
+
+  wordTimeline.forEach((word, index) => {
+    word.text = word.text.trim();
+    // skip empty words
+    if (!word.text) return;
+    // skip music or sound effects quoted in []
+    if (word.text.match(/^\[.*\]$/)) return;
+
+    const wordEntry = {
+      type: "word" as TimelineEntryType,
+      text: word.text,
+      startTime: word.startTime,
+      endTime: word.endTime,
+    };
+
+    let sentence: TimelineEntry;
+    // get the last sentence in the timeline
+    if (timeline.length > 0) {
+      sentence = timeline[timeline.length - 1];
+    }
+
+    // if there is no sentence in the timeline, or the last sentence
+    // already ends with punctuation, create a new sentence
+    if (!sentence || sentence.text.match(sentenceEndPattern)) {
+      sentence = {
+        type: "sentence" as TimelineEntryType,
+        text: "",
+        startTime: wordEntry.startTime,
+        endTime: wordEntry.endTime,
+        timeline: [],
+      };
+      timeline.push(sentence);
+    }
+
+    // if the word ends with punctuation, close the current sentence
+    if (wordEntry.text.match(sentenceEndPattern)) {
+      sentence.text += wordEntry.text;
+      sentence.endTime = wordEntry.endTime;
+
+      const lastSentence = timeline[timeline.length - 1];
+      if (lastSentence.endTime !== sentence.endTime) {
+        timeline.push(sentence);
+      }
+    } else {
+      sentence.text += wordEntry.text + " ";
+      sentence.endTime = wordEntry.endTime;
+
+      if (index === wordTimeline.length - 1) {
+        timeline.push(sentence);
+      }
+    }
+  });
+
+  return timeline;
+};

 export const useTranscribe = () => {
   const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
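The segmentation helper folds a flat word timeline into sentence entries by splitting on end-of-sentence punctuation. A hedged sketch of the two patterns it relies on follows; treat the sentenceEndPattern body as an assumption that merely matches the commented behavior (the PR's exact regex may differ), while punctuationsPattern mirrors the inline regex visible in the removed code further down:

// Assumed shapes, not necessarily the PR's exact definitions.
const sentenceEndPattern = /(?<!\b(?:Mr|Mrs|Dr|Ms|etc))[.?!]["']?$/;
const punctuationsPattern = /\w[.,!?](\s|$)/;

// Usage: fold a flat word timeline into sentence entries.
const sentences = wordTimelineToSentenceTimeline([
  { type: "word", text: "Hello", startTime: 0.0, endTime: 0.4 },
  { type: "word", text: "world.", startTime: 0.4, endTime: 0.9 },
  { type: "word", text: "Bye.", startTime: 1.2, endTime: 1.5 },
]);
// => [{ text: "Hello world.", startTime: 0.0, endTime: 0.9, ... },
//     { text: "Bye.", startTime: 1.2, endTime: 1.5, ... }]
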
@@ -37,13 +120,14 @@
       targetType?: string;
       originalText?: string;
       language: string;
-      service: WhisperConfigType["service"];
+      service: WhisperConfigType["service"] | "upload";
       isolate?: boolean;
     }
   ): Promise<{
     engine: string;
     model: string;
-    alignmentResult: AlignmentResult;
+    transcript: string;
+    timeline: TimelineEntry[];
     originalText?: string;
     tokenId?: number;
   }> => {
@@ -58,67 +142,152 @@
     } = params || {};

     const blob = await (await fetch(url)).blob();

-    let result;
-    if (originalText) {
-      result = {
-        engine: "original",
-        model: "original",
-      };
+    let result: any;
+    let timeline: Timeline = [];
+    if (service === "upload" && originalText) {
+      const caption = await parseText(originalText, { type: "srt" });
+      if (caption.cues.length > 0) {
+        timeline = caption.cues.map((cue) => {
+          return {
+            type: "sentence",
+            text: cue.text,
+            startTime: cue.startTime,
+            endTime: cue.endTime,
+            timeline: [],
+          };
+        });
+        result = {
+          engine: "upload",
+          model: "-",
+          text: timeline.map((entry) => entry.text).join(" "),
+          timeline,
+        };
+      } else {
+        result = {
+          engine: "upload",
+          model: "-",
+          text: originalText,
+        };
+      }
     } else if (service === "local") {
       result = await transcribeByLocal(url, language);
     } else if (service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (service === "openai") {
-      result = await transcribeByOpenAi(blob);
+      result = await transcribeByOpenAi(
+        new File([blob], "audio.mp3", { type: "audio/mp3" })
+      );
     } else if (service === "azure") {
-      result = await transcribeByAzureAi(blob, language, {
-        targetId,
-        targetType,
-      });
+      result = await transcribeByAzureAi(
+        new File([blob], "audio.wav", { type: "audio/wav" }),
+        language,
+        {
+          targetId,
+          targetType,
+        }
+      );
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }

+    let transcript = result.text;

-    setOutput(null);
+    /*
+     * If a timeline is available and the transcript contains punctuation,
+     * use `alignSegments` to align each sentence with the timeline;
+     * otherwise, use `align` to align the whole transcript.
+     * If the transcript does not contain any punctuation, use the AI
+     * command to add punctuation first.
+     */
+    if (result.timeline?.length && transcript.match(punctuationsPattern)) {
+      timeline = [...result.timeline];
+      setOutput("Aligning the transcript...");
+      const wordTimeline = await EnjoyApp.echogarden.alignSegments(
+        new Uint8Array(await blob.arrayBuffer()),
+        timeline,
+        {
+          language,
+          isolate,
+        }
+      );

-    let transcript = originalText || result.text;
+      wordTimeline.forEach((word: TimelineEntry) => {
+        let sentence = timeline.find(
+          (entry) =>
+            word.startTime >= entry.startTime && word.endTime <= entry.endTime
+        );

-    // Remove all content inside `()`, `[]`, `{}` and trim the text
-    // remove all markdown formatting
-    transcript = transcript
-      .replace(/\(.*?\)/g, "")
-      .replace(/\[.*?\]/g, "")
-      .replace(/\{.*?\}/g, "")
-      .replace(/[*_`]/g, "")
-      .trim();
+        if (sentence) {
+          sentence.timeline.push(word);
+        }
+      });

-    // if the transcript does not contain any punctuation, use AI command to add punctuation
-    if (!transcript.match(/\w[.,!?](\s|$)/)) {
-      try {
-        transcript = await punctuateText(transcript);
-      } catch (err) {
-        toast.error(err.message);
-        console.warn(err.message);
+      /*
+       * the start time of a sentence should be the start time of the first word in the sentence
+       * the end time of a sentence should be the end time of the last word in the sentence
+       */
+      // timeline.forEach((t) => {
+      //   if (t.timeline.length === 0) return;
+
+      //   t.startTime = t.timeline[0].startTime;
+      //   t.endTime = t.timeline[t.timeline.length - 1].endTime;
+      // });
+    } else {
+      // Remove all content inside `()`, `[]`, `{}` and trim the text
+      // remove all markdown formatting
+      transcript = transcript
+        .replace(/\(.*?\)/g, "")
+        .replace(/\[.*?\]/g, "")
+        .replace(/\{.*?\}/g, "")
+        .replace(/[*_`]/g, "")
+        .trim();
+
+      // if the transcript does not contain any punctuation, use the AI command to add punctuation
+      if (!transcript.match(punctuationsPattern)) {
+        try {
+          transcript = await punctuateText(transcript);
+        } catch (err) {
+          toast.error(err.message);
+          console.warn(err.message);
+        }
       }
+
+      setOutput("Aligning the transcript...");
+      const alignmentResult = await EnjoyApp.echogarden.align(
+        new Uint8Array(await blob.arrayBuffer()),
+        transcript,
+        {
+          language,
+          isolate,
+        }
+      );
+
+      alignmentResult.timeline.forEach((t: TimelineEntry) => {
+        if (t.type === "sentence") {
+          timeline.push(t);
+        } else {
+          t.timeline.forEach((st) => {
+            timeline.push(st);
+          });
+        }
+      });
     }

-    const alignmentResult = await EnjoyApp.echogarden.align(
-      new Uint8Array(await blob.arrayBuffer()),
-      transcript,
-      {
-        language,
-        isolate,
-      }
-    );
-
     return {
       ...result,
       originalText,
-      alignmentResult,
+      transcript,
+      timeline,
     };
   };

-  const transcribeByLocal = async (url: string, language?: string) => {
+  const transcribeByLocal = async (
+    url: string,
+    language?: string
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline: TimelineEntry[];
+  }> => {
     const res = await EnjoyApp.whisper.transcribe(
       {
         file: url,
@@ -130,14 +299,25 @@
       }
     );

+    const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.text,
+        startTime: word.offsets.from / 1000.0,
+        endTime: word.offsets.to / 1000.0,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "whisper",
       model: res.model.type,
       text: res.transcription.map((segment) => segment.text).join(" "),
+      timeline,
     };
   };

-  const transcribeByOpenAi = async (blob: Blob) => {
+  const transcribeByOpenAi = async (file: File) => {
     if (!openai?.key) {
       throw new Error(t("openaiKeyRequired"));
     }
@@ -149,20 +329,58 @@
       maxRetries: 0,
     });

-    const res: { text: string } = (await client.audio.transcriptions.create({
-      file: new File([blob], "audio.wav"),
+    const res: {
+      text: string;
+      words?: { word: string; start: number; end: number }[];
+      segments?: { text: string; start: number; end: number }[];
+    } = (await client.audio.transcriptions.create({
+      file,
       model: "whisper-1",
-      response_format: "json",
+      response_format: "verbose_json",
+      timestamp_granularities: ["word"],
    })) as any;

+    let timeline: TimelineEntry[] = [];
+    if (res.segments) {
+      res.segments.forEach((segment) => {
+        const segmentTimeline = {
+          type: "sentence" as TimelineEntryType,
+          text: segment.text,
+          startTime: segment.start,
+          endTime: segment.end,
+          timeline: [] as Timeline,
+        };
+
+        timeline.push(segmentTimeline);
+      });
+    } else if (res.words) {
+      const wordTimeline = res.words.map((word) => {
+        return {
+          type: "word" as TimelineEntryType,
+          text: word.word,
+          startTime: word.start,
+          endTime: word.end,
+        };
+      });
+      timeline = wordTimelineToSentenceTimeline(wordTimeline);
+    }
+
     return {
       engine: "openai",
       model: "whisper-1",
       text: res.text,
+      timeline,
     };
   };
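The OpenAI branch now uploads a named File (the SDK and API infer the container format from the filename) and requests verbose_json, which is the only response format for which timestamp_granularities is honored. A sketch of how the typed response above feeds the timeline (values are illustrative, and whether segments accompany word-level granularity is an assumption):

// Illustrative verbose_json response, matching the type above.
const res = {
  text: "Hello world.",
  words: [
    { word: "Hello", start: 0.0, end: 0.42 },
    { word: "world", start: 0.42, end: 0.9 },
  ],
};

const timeline = res.words
  ? wordTimelineToSentenceTimeline(
      res.words.map((w) => ({
        type: "word" as const,
        text: w.word,
        startTime: w.start,
        endTime: w.end,
      }))
    )
  : [];
// => a single sentence entry spanning 0.0 - 0.9 s
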
-  const transcribeByCloudflareAi = async (blob: Blob) => {
+  const transcribeByCloudflareAi = async (
+    blob: Blob
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline?: TimelineEntry[];
+  }> => {
     const res: CfWhipserOutputType = (
       await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
         headers: {
         },
       })
     ).data;

+    const wordTimeline = res.words.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.word,
+        startTime: word.start,
+        endTime: word.end,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "cloudflare",
       model: "@cf/openai/whisper",
       text: res.text,
+      timeline,
     };
   };

   const transcribeByAzureAi = async (
-    blob: Blob,
+    file: File,
     language: string,
     params?: {
       targetId?: string;
       targetType?: string;
     }
   ): Promise<{
     engine: string;
     model: string;
     text: string;
     tokenId: number;
+    timeline?: TimelineEntry[];
   }> => {
     const { id, token, region } = await webApi.generateSpeechToken(params);
     const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
-    const audioConfig = sdk.AudioConfig.fromWavFileInput(
-      new File([blob], "audio.wav")
-    );
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(file);

     // setting the recognition language to learning language, such as 'en-US'.
     config.speechRecognitionLanguage = language;
     config.requestWordLevelTimestamps();

     const reco = new sdk.SpeechRecognizer(config, audioConfig);
@@ -209,7 +437,6 @@
     return new Promise((resolve, reject) => {
       reco.recognizing = (_s, e) => {
-        console.log(e.result);
         setOutput(e.result.text);
       };
@@ -232,10 +459,40 @@
       reco.sessionStopped = (_s, _e) => {
         reco.stopContinuousRecognitionAsync();

+        const wordTimeline: TimelineEntry[] = [];
+        results.forEach((result) => {
+          const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
+          const splitedWords = best.Display.trim().split(" ");
+
+          best.Words.forEach((word, index) => {
+            let text = word.Word;
+            if (splitedWords.length === best.Words.length) {
+              text = splitedWords[index];
+            }
+
+            if (
+              index === best.Words.length - 1 &&
+              !text.trim().match(sentenceEndPattern)
+            ) {
+              text = text + ".";
+            }
+
+            wordTimeline.push({
+              type: "word" as TimelineEntryType,
+              text,
+              startTime: word.Offset / 10000000.0,
+              endTime: (word.Offset + word.Duration) / 10000000.0,
+            });
+          });
+        });
+
+        const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
         resolve({
           engine: "azure",
           model: "whisper",
           text: results.map((result) => result.DisplayText).join(" "),
+          timeline,
           tokenId: id,
         });
       };
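Azure Speech reports word offsets in 100-nanosecond ticks, hence the divisions by 10,000,000 above. A quick worked example:

// Azure Speech NBest word entry: Offset/Duration are 100-ns ticks.
const word = { Word: "hello", Offset: 23500000, Duration: 4200000 };

const startTime = word.Offset / 10000000.0;                 // 2.35 s
const endTime = (word.Offset + word.Duration) / 10000000.0; // 2.77 s
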
diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
index 5522734f..ba72684b 100644
--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -20,9 +20,9 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   const [transcribingProgress, setTranscribingProgress] = useState(0);
   const [transcribing, setTranscribing] = useState(false);
   const [transcribingOutput, setTranscribingOutput] = useState("");
-  const [service, setService] = useState<WhisperConfigType["service"]>(
-    whisperConfig.service
-  );
+  const [service, setService] = useState<
+    WhisperConfigType["service"] | "upload"
+  >(whisperConfig.service);

   const onTransactionUpdate = (event: CustomEvent) => {
     if (!transcription) return;
@@ -63,7 +63,7 @@
   const generateTranscription = async (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => {
     let {
@@ -87,7 +87,7 @@
         }
       }
     }
-    const { engine, model, alignmentResult, tokenId } = await transcribe(
+    const { engine, model, transcript, timeline, tokenId } = await transcribe(
       media.src,
       {
         targetId: media.id,
       }
     );

-    let timeline: TimelineEntry[] = [];
-    alignmentResult.timeline.forEach((t) => {
-      if (t.type === "sentence") {
-        timeline.push(t);
-      } else {
-        t.timeline.forEach((st) => {
-          timeline.push(st);
-        });
-      }
-    });
-
-    timeline = preProcessTranscription(timeline);
+    const processedTimeline = preProcessTranscription(timeline);
     if (media.language !== language) {
       if (media.mediaType === "Video") {
         await EnjoyApp.videos.update(media.id, {
     await EnjoyApp.transcriptions.update(transcription.id, {
       state: "finished",
       result: {
-        timeline: timeline,
-        transcript: alignmentResult.transcript,
+        timeline: processedTimeline,
+        transcript,
         originalText,
         tokenId,
       },
diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts
index 8d4467ad..afccd064 100644
--- a/enjoy/src/types/enjoy-app.d.ts
+++ b/enjoy/src/types/enjoy-app.d.ts
@@ -252,6 +252,11 @@ type EnjoyAppType = {
       transcript: string,
       options?: any
     ) => Promise<AlignmentResult>;
+    alignSegments: (
+      input: string | Uint8Array,
+      timeline: Timeline,
+      options?: any
+    ) => Promise<Timeline>;
     transcode: (input: string) => Promise;
     check: () => Promise;
   };
diff --git a/enjoy/src/utils.ts b/enjoy/src/utils.ts
index 64ead8c2..dfac4d7d 100644
--- a/enjoy/src/utils.ts
+++ b/enjoy/src/utils.ts
@@ -49,7 +49,7 @@ export function milisecondsToTimestamp(ms: number) {
   const hours = Math.floor(ms / 3600000).toString();
   const minutes = Math.floor((ms % 3600000) / 60000).toString();
   const seconds = Math.floor(((ms % 360000) % 60000) / 1000).toString();
-  const milliseconds = Math.floor(((ms % 360000) % 60000) % 1000).toString();
+  const milliseconds = Math.round(((ms % 360000) % 60000) % 1000).toString();

   return `${hours.padStart(2, "0")}:${minutes.padStart(
     2,
     "0"
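
A worked example for the patched helper. Note that (ms % 360000) % 60000 equals ms % 60000, because 360000 is a multiple of 60000, so the unusual inner modulus does not affect the result. The function pads hours and minutes with padStart; assuming an SRT-style comma before the milliseconds, the output looks like this:

milisecondsToTimestamp(3725250);
// hours        = floor(3725250 / 3600000)           = "1"  -> "01"
// minutes      = floor((3725250 % 3600000) / 60000) = "2"  -> "02"
// seconds      = floor((3725250 % 60000) / 1000)    = "5"  -> "05"
// milliseconds = round(3725250 % 1000)              = "250"
// => "01:02:05,250"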