diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
index b9c8e21a..b67f0c1b 100644
--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -567,5 +567,8 @@
   "areYouSureToDeleteThisNote": "Are you sure to delete this note?",
   "notesCount": "{{count}} notes",
   "source": "source",
-  "noNotesYet": "No notes yet"
+  "noNotesYet": "No notes yet",
+  "editTranscription": "Edit transcription",
+  "saveTranscription": "Save transcription",
+  "areYouSureToSaveTranscription": "This will run a forced alignment between the audio and your edited transcription. Are you sure you want to continue?"
 }
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
index 7c0de682..d554cfd4 100644
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -566,5 +566,8 @@
   "areYouSureToDeleteThisNote": "您确定要删除这条笔记吗？",
   "notesCount": "{{count}} 条笔记",
   "source": "来源",
-  "noNotesYet": "还没有笔记"
+  "noNotesYet": "还没有笔记",
+  "editTranscription": "编辑语音文本",
+  "saveTranscription": "保存语音文本",
+  "areYouSureToSaveTranscription": "即将根据您修改后的语音文本对语音重新进行对齐，确定要继续吗？"
 }
diff --git a/enjoy/src/renderer/components/medias/index.ts b/enjoy/src/renderer/components/medias/index.ts
index ba93a3a5..968448f9 100644
--- a/enjoy/src/renderer/components/medias/index.ts
+++ b/enjoy/src/renderer/components/medias/index.ts
@@ -5,6 +5,7 @@ export * from "./media-recordings";
 export * from "./media-current-recording";
 export * from "./media-recorder";
 export * from "./media-transcription";
+export * from "./media-transcription-form";
 export * from "./media-player";
 export * from "./media-provider";
 export * from "./media-tabs";
diff --git a/enjoy/src/renderer/components/medias/media-transcription-form.tsx b/enjoy/src/renderer/components/medias/media-transcription-form.tsx
new file mode 100644
index 00000000..cf18e028
--- /dev/null
+++ b/enjoy/src/renderer/components/medias/media-transcription-form.tsx
@@ -0,0 +1,148 @@
+import { MediaPlayerProviderContext } from "@renderer/context";
+import {
+  AlertDialog,
+  AlertDialogAction,
+  AlertDialogCancel,
+  AlertDialogContent,
+  AlertDialogDescription,
+  AlertDialogFooter,
+  AlertDialogHeader,
+  AlertDialogTitle,
+  AlertDialogTrigger,
+  Button,
+  Dialog,
+  DialogClose,
+  DialogContent,
+  DialogFooter,
+  DialogHeader,
+  DialogTitle,
+  DialogTrigger,
+  Textarea,
+  toast,
+} from "@renderer/components/ui";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { t } from "i18next";
+import { useContext, useState } from "react";
+import { LoaderIcon } from "lucide-react";
+
+/**
+ * Button that opens a dialog for editing the current media's transcription.
+ *
+ * Owns the dialog's open state and hands the setter to `TranscriptionForm`
+ * so the form can close the dialog after a successful save.
+ */
+export const MediaTranscriptionForm = () => {
+  const [open, setOpen] = useState(false);
+
+  return (
+    <Dialog open={open} onOpenChange={setOpen}>
+      <DialogTrigger asChild>
+        <Button variant="outline" size="sm">
+          <span className="capitalize">{t("edit")}</span>
+        </Button>
+      </DialogTrigger>
+      <DialogContent className="max-w-screen-sm xl:max-w-screen-md">
+        <TranscriptionForm setOpen={setOpen} />
+      </DialogContent>
+    </Dialog>
+  );
+};
+
+/**
+ * Form body of the transcription edit dialog.
+ *
+ * Shows the current transcription as editable text, one timeline entry per
+ * paragraph, and after confirmation passes the edited text to
+ * `generateTranscription` from `MediaPlayerProviderContext`.
+ *
+ * @param props.setOpen - Setter for the enclosing dialog's open state; called
+ *   with `false` once saving succeeds.
+ */
+export const TranscriptionForm = (props: {
+  setOpen: (value: boolean) => void;
+}) => {
+  const { setOpen } = props;
+  const [submiting, setSubmiting] = useState(false);
+  const { transcription, generateTranscription } = useContext(
+    MediaPlayerProviderContext
+  );
+  // `transcription.result` may be missing, in which case the editor starts
+  // empty instead of throwing while the dialog mounts.
+  const [content, setContent] = useState<string>(() =>
+    (transcription?.result?.timeline ?? [])
+      .map((entry: TimelineEntry) => entry.text)
+      .join("\n\n")
+  );
+
+  /**
+   * Sends the edited text to `generateTranscription` and closes the dialog
+   * when it resolves; if it rejects, the error is shown in a toast and the
+   * dialog stays open.
+   */
+  const handleSave = async () => {
+    setSubmiting(true);
+    try {
+      await generateTranscription(content);
+      setOpen(false);
+    } catch (e) {
+      // Anything can be thrown, so only read `message` from real errors.
+      toast.error(e instanceof Error ? e.message : String(e));
+    } finally {
+      // Re-enable the controls whatever the outcome.
+      setSubmiting(false);
+    }
+  };
+
+  return (
+    <>
+      <DialogHeader>
+        <DialogTitle>{t("editTranscription")}</DialogTitle>
+      </DialogHeader>
+      <div>
+        <Textarea
+          disabled={submiting}
+          className="h-96 text-lg font-serif resize-none"
+          value={content}
+          onChange={(e) => setContent(e.target.value)}
+        />
+      </div>
+      <DialogFooter>
+        <DialogClose asChild>
+          <Button disabled={submiting} variant="secondary">
+            {t("cancel")}
+          </Button>
+        </DialogClose>
+
+        <AlertDialog>
+          <AlertDialogTrigger asChild>
+            <Button disabled={submiting}>
+              {submiting && <LoaderIcon className="animate-spin w-4 mr-2" />}
+              {t("save")}
+            </Button>
+          </AlertDialogTrigger>
+          <AlertDialogContent>
+            <AlertDialogHeader>
+              <AlertDialogTitle>{t("saveTranscription")}</AlertDialogTitle>
+              <AlertDialogDescription>
+                {t("areYouSureToSaveTranscription")}
+              </AlertDialogDescription>
+            </AlertDialogHeader>
+            <AlertDialogFooter>
+              <AlertDialogCancel disabled={submiting}>
+                {t("cancel")}
+              </AlertDialogCancel>
+              <AlertDialogAction asChild>
+                <Button disabled={submiting} onClick={handleSave}>
+                  {submiting && (
+                    <LoaderIcon className="animate-spin w-4 mr-2" />
+                  )}
+                  {t("save")}
+                </Button>
+              </AlertDialogAction>
+            </AlertDialogFooter>
+          </AlertDialogContent>
+        </AlertDialog>
+      </DialogFooter>
+    </>
+  );
+};
diff --git a/enjoy/src/renderer/components/medias/media-transcription.tsx b/enjoy/src/renderer/components/medias/media-transcription.tsx
index c3f1cf0d..da8a94fa 100644
--- a/enjoy/src/renderer/components/medias/media-transcription.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -26,6 +26,7 @@ import {
 } from "lucide-react";
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { formatDuration } from "@renderer/lib/utils";
+import { MediaTranscriptionForm } from "./media-transcription-form";
 
 export const MediaTranscription = () => {
   const containerRef = useRef<HTMLDivElement>();
@@ -113,37 +114,42 @@ export const MediaTranscription = () => {
             )}
             <span className="capitalize">{t("transcript")}</span>
           </div>
-          <AlertDialog>
-            <AlertDialogTrigger asChild>
-              <Button
-                variant="outline"
-                size="sm"
-                disabled={transcribing || transcription.state === "processing"}
-                className="capitalize"
-              >
-                {(transcribing || transcription.state === "processing") && (
-                  <LoaderIcon className="animate-spin w-4 mr-2" />
-                )}
-                {transcription.result ? t("regenerate") : t("transcribe")}
-              </Button>
-            </AlertDialogTrigger>
-            <AlertDialogContent>
-              <AlertDialogHeader>
-                <AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
-                <AlertDialogDescription>
-                  {t("transcribeMediaConfirmation", {
-                    name: media.name,
-                  })}
-                </AlertDialogDescription>
-              </AlertDialogHeader>
-              <AlertDialogFooter>
-                <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
-                <AlertDialogAction onClick={generateTranscription}>
-                  {t("transcribe")}
-                </AlertDialogAction>
-              </AlertDialogFooter>
-            </AlertDialogContent>
-          </AlertDialog>
+          <div className="flex space-x-2">
+            <AlertDialog>
+              <AlertDialogTrigger asChild>
+                <Button
+                  variant="outline"
+                  size="sm"
+                  disabled={
+                    transcribing || transcription.state === "processing"
+                  }
+                  className="capitalize"
+                >
+                  {(transcribing || transcription.state === "processing") && (
+                    <LoaderIcon className="animate-spin w-4 mr-2" />
+                  )}
+                  {transcription.result ? t("regenerate") : t("transcribe")}
+                </Button>
+              </AlertDialogTrigger>
+              <AlertDialogContent>
+                <AlertDialogHeader>
+                  <AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
+                  <AlertDialogDescription>
+                    {t("transcribeMediaConfirmation", {
+                      name: media.name,
+                    })}
+                  </AlertDialogDescription>
+                </AlertDialogHeader>
+                <AlertDialogFooter>
+                  <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
+                  <AlertDialogAction onClick={() => generateTranscription("")}>
+                    {t("transcribe")}
+                  </AlertDialogAction>
+                </AlertDialogFooter>
+              </AlertDialogContent>
+            </AlertDialog>
+            <MediaTranscriptionForm />
+          </div>
         </div>
       </div>
 
diff --git a/enjoy/src/renderer/components/ui/dialog.tsx b/enjoy/src/renderer/components/ui/dialog.tsx
index 87fa1322..50858919 100644
--- a/enjoy/src/renderer/components/ui/dialog.tsx
+++ b/enjoy/src/renderer/components/ui/dialog.tsx
@@ -44,7 +44,7 @@ const DialogContent = React.forwardRef<
       {...props}
     >
       {children}
-      <DialogPrimitive.Close className="absolute right-4 top-4 rounded-sm opacity-70 ring-offset-background transition-opacity hover:opacity-100 focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 disabled:pointer-events-none data-[state=open]:bg-accent data-[state=open]:text-muted-foreground">
+      <DialogPrimitive.Close className="absolute right-4 top-4 rounded-sm opacity-70 ring-offset-background transition-opacity hover:opacity-100 disabled:pointer-events-none data-[state=open]:bg-accent data-[state=open]:text-muted-foreground">
         <Cross2Icon className="h-4 w-4" />
         <span className="sr-only">Close</span>
       </DialogPrimitive.Close>
diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx
index 66a4c6cf..ee268947 100644
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -66,7 +66,7 @@ type MediaPlayerContextType = {
   pitchChart: Chart;
   // Transcription
   transcription: TranscriptionType;
-  generateTranscription: () => void;
+  generateTranscription: (text?: string) => void;
   transcribing: boolean;
   transcribingProgress: number;
   transcriptionDraft: TranscriptionType["result"];
diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
index 7f28c9df..fe236ff3 100644
--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -8,6 +8,7 @@ import {
 import { toast } from "@renderer/components/ui";
 import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";
+import { ca } from "@vidstack/react/types/vidstack-react";
 
 export const useTranscriptions = (media: AudioType | VideoType) => {
   const { whisperConfig } = useContext(AISettingsProviderContext);
@@ -52,14 +53,15 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
         });
     };
 
-  const generateTranscription = async () => {
-    let originalText: string;
-    if (transcription?.targetId === media.id) {
-      originalText = transcription.result?.originalText;
-    } else {
-      const r = await findOrCreateTranscription();
-      if (r) {
-        originalText = r.result?.originalText;
+  const generateTranscription = async (originalText?: string) => {
+    if (originalText === undefined) {
+      if (transcription?.targetId === media.id) {
+        originalText = transcription.result?.originalText;
+      } else {
+        const r = await findOrCreateTranscription();
+        if (r) {
+          originalText = r.result?.originalText;
+        }
       }
     }
 
@@ -87,65 +89,72 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
        * Pre-process
        * 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
        * 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
-       * 3. Some numbers with `%` are split into `number + percent` in words timeline, merge thme for display;
+       * 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
        */
-      timeline.forEach((sentence, i) => {
-        const nextSentence = timeline[i + 1];
-        if (
-          !sentence.text
-            .replaceAll(MAGIC_TOKEN_REGEX, "")
-            .match(END_OF_SENTENCE_REGEX) &&
-          nextSentence?.text
-        ) {
-          nextSentence.text = [sentence.text, nextSentence.text].join(" ");
-          nextSentence.timeline = [
-            ...sentence.timeline,
-            ...nextSentence.timeline,
-          ];
-          nextSentence.startTime = sentence.startTime;
-          timeline.splice(i, 1);
-        } else {
-          const words = sentence.text.split(" ");
+      try {
+        timeline.forEach((sentence, i) => {
+          const nextSentence = timeline[i + 1];
+          if (
+            !sentence.text
+              .replaceAll(MAGIC_TOKEN_REGEX, "")
+              .match(END_OF_SENTENCE_REGEX) &&
+            nextSentence?.text
+          ) {
+            nextSentence.text = [sentence.text, nextSentence.text].join(" ");
+            nextSentence.timeline = [
+              ...sentence.timeline,
+              ...nextSentence.timeline,
+            ];
+            nextSentence.startTime = sentence.startTime;
+            timeline.splice(i, 1);
+          } else {
+            const words = sentence.text.split(" ");
 
-          sentence.timeline.forEach((token, j) => {
-            const word = words[j]?.trim()?.toLowerCase();
+            sentence.timeline.forEach((token, j) => {
+              const word = words[j]?.trim()?.toLowerCase();
 
-            const match = word?.match(/-|%/);
-            if (!match) return;
+              const match = word?.match(/-|%/);
+              if (!match) return;
 
-            if (word === '-' && token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()) {
-              sentence.timeline.splice(j, 0, {
-                type: 'token',
-                text: '-',
-                startTime: sentence.timeline[j - 1]?.endTime || 0,
-                endTime: sentence.timeline[j - 1]?.endTime || 0,
-                timeline: [],
-              })
-              return;
-            }
-
-            for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
-              if (word.includes(sentence.timeline[k].text.toLowerCase())) {
-                let connector = "";
-                if (match[0] === "-") {
-                  connector = "-";
-                }
-                token.text = [token.text, sentence.timeline[k].text].join(
-                  connector
-                );
-                token.timeline = [
-                  ...token.timeline,
-                  ...sentence.timeline[k].timeline,
-                ];
-                token.endTime = sentence.timeline[k].endTime;
-                sentence.timeline.splice(k, 1);
-              } else {
-                break;
+              if (
+                word === "-" &&
+                token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
+              ) {
+                sentence.timeline.splice(j, 0, {
+                  type: "token",
+                  text: "-",
+                  startTime: sentence.timeline[j - 1]?.endTime || 0,
+                  endTime: sentence.timeline[j - 1]?.endTime || 0,
+                  timeline: [],
+                });
+                return;
               }
-            }
-          });
-        }
-      });
+
+              for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
+                if (word.includes(sentence.timeline[k].text.toLowerCase())) {
+                  let connector = "";
+                  if (match[0] === "-") {
+                    connector = "-";
+                  }
+                  token.text = [token.text, sentence.timeline[k].text].join(
+                    connector
+                  );
+                  token.timeline = [
+                    ...token.timeline,
+                    ...sentence.timeline[k].timeline,
+                  ];
+                  token.endTime = sentence.timeline[k].endTime;
+                  sentence.timeline.splice(k, 1);
+                } else {
+                  break;
+                }
+              }
+            });
+          }
+        });
+      } catch (err) {
+        console.error(err);
+      }
 
       await EnjoyApp.transcriptions.update(transcription.id, {
         state: "finished",