Transcription force alignment & more (#416)

* add wavesurfer-provider

* brand new layout for player

* refactor pitch contour

* clean up

* update style

* refactor

* update layout

* use new layout for video

* refactor

* may select word

* may edit word timestamp

* may toggle multiselect words

* clean code

* improve word region update

* improve layout

* update layout

* add echogarden

* fix test

* use aligned transcription

* fix ipa

* some refactor

* improve code

* implement ipa & translate & lookup

* recording play & share

* fix

* fix post audio

* improve layout

* may delete recording

* may record

* fix video player layout

* fix player in conversation

* render recording along with original audio

* may create custom region in recording

* fix float issue when seekTo

* fix recording player

* fix load more recordings

* fix seekTo

* clean up

* refactor pitch contour

* fix some warnings

* upgrade deps

* fix group transcription sentence

* zoom to fit when segment update

* add more hotkeys

* update player layout

* improve style

* play recording over the original audio when comparing

* update echogarden dep

* add recorded mark on transcription

* fix recording pitch contour rendering

* improve recording

* adjust pitch finder params
an-lee
2024-03-16 19:42:37 +08:00
committed by GitHub
parent fe43755e02
commit 90f38e9226
67 changed files with 6898 additions and 2643 deletions


@@ -7,9 +7,9 @@
"markdown-it-mathjax3": "^4.3.2",
"markdown-it-sub": "^2.0.0",
"markdown-it-sup": "^2.0.0",
"mermaid": "^10.8.0",
"sass": "^1.71.1",
"vitepress": "^1.0.0-rc.42",
"mermaid": "^10.9.0",
"sass": "^1.72.0",
"vitepress": "^1.0.0-rc.45",
"vitepress-plugin-mermaid": "^2.0.16",
"vue": "^3.4.21"
},


@@ -78,6 +78,16 @@ test("valid ffmpeg command", async () => {
expect(res).toBeTruthy();
});
test("validate echogarden align command", async () => {
const res = await page.evaluate(() => {
return window.__ENJOY_APP__.echogarden.check();
});
expect(res).toBeTruthy();
const settings = fs.readJsonSync(path.join(resultDir, "settings.json"));
expect(settings.whisper.service).toBe("local");
});
test("should setup default library path", async () => {
const settings = fs.readJsonSync(path.join(resultDir, "settings.json"));
expect(settings.library).not.toBeNull();


@@ -122,9 +122,39 @@ test.describe("with login", async () => {
},
});
});
});
/*
* steps:
* 1. create a tts conversation
* 2. submit a message to the conversation
* 3. the speech should auto create
*/
test("tts conversation", async () => {
// navigate to the conversations page
await page.getByTestId("sidebar-conversations").click();
// trigger new conversation modal
await page.getByTestId("conversation-new-button").click();
// create a tts conversation
await page.click("[data-testid=conversation-preset-tts]");
await page.getByTestId("conversation-form").waitFor();
await page.click("[data-testid=conversation-form-submit]");
// wait for the conversation to be created
await page.getByTestId("conversation-page").waitFor();
// submit a message to the conversation
await page.getByTestId("conversation-page-input").fill("How are you?");
await page.getByTestId("conversation-page-submit").click();
await page.locator(".ai-message").waitFor();
const player = page
.locator(".ai-message")
.getByTestId("wavesurfer-container");
await player.waitFor();
expect(await player.isVisible()).toBeTruthy();
});
/*
@@ -136,6 +166,9 @@ test.describe("with login", async () => {
* 5. audio waveform player should be visible and transcription should be generated
*/
test("gpt conversation", async () => {
// navigate to the conversations page
await page.getByTestId("sidebar-conversations").click();
// trigger new conversation modal
await page.getByTestId("conversation-new-button").click();
@@ -166,43 +199,12 @@ test.describe("with login", async () => {
// add to library
await page.getByTestId("message-start-shadow").click();
await page.getByTestId("audio-detail").waitFor();
await page.getByTestId("audio-player").waitFor();
await page.getByTestId("media-player-container").waitFor();
await page.getByTestId("media-transcription").waitFor();
await page.getByTestId("media-transcription-result").waitFor();
expect(
await page.getByTestId("media-transcription-result").isVisible()
).toBeTruthy();
});
/*
* steps:
* 1. create a tts conversation
* 2. submit a message to the conversation
* 3. the speech should auto create
*/
test("tts conversation", async () => {
// trigger new conversation modal
await page.getByTestId("conversation-new-button").click();
// create a tts conversation
await page.click("[data-testid=conversation-preset-tts]");
await page.getByTestId("conversation-form").waitFor();
await page.click("[data-testid=conversation-form-submit]");
// wait for the conversation to be created
await page.getByTestId("conversation-page").waitFor();
// submit a message to the conversation
await page.getByTestId("conversation-page-input").fill("How are you?");
await page.getByTestId("conversation-page-submit").click();
await page.locator(".ai-message").waitFor();
const player = page
.locator(".ai-message")
.getByTestId("wavesurfer-container");
await player.waitFor();
expect(await player.isVisible()).toBeTruthy();
});
});
});


@@ -12,7 +12,7 @@ const config = {
asar: {
// Binary files won't work in asar, so we need to unpack them
unpackDir:
"{.vite/build/lib,.vite/build/samples,node_modules/ffmpeg-static,node_modules/@andrkrn/ffprobe-static}",
"{.vite/build/lib,.vite/build/samples,node_modules/ffmpeg-static,node_modules/@andrkrn/ffprobe-static,node_modules/onnxruntime-node/bin}",
},
icon: "./assets/icon",
name: "Enjoy",


@@ -47,18 +47,18 @@
"@types/fluent-ffmpeg": "^2.1.24",
"@types/html-to-text": "^9.0.4",
"@types/intl-tel-input": "^18.1.4",
"@types/lodash": "^4.14.202",
"@types/lodash": "^4.17.0",
"@types/mark.js": "^8.11.12",
"@types/node": "^20.11.24",
"@types/react": "^18.2.62",
"@types/react-dom": "^18.2.19",
"@types/node": "^20.11.27",
"@types/react": "^18.2.66",
"@types/react-dom": "^18.2.22",
"@types/validator": "^13.11.9",
"@types/wavesurfer.js": "^6.0.12",
"@typescript-eslint/eslint-plugin": "^7.1.1",
"@typescript-eslint/parser": "^7.1.1",
"@typescript-eslint/eslint-plugin": "^7.2.0",
"@typescript-eslint/parser": "^7.2.0",
"@vitejs/plugin-react": "^4.2.1",
"autoprefixer": "^10.4.18",
"electron": "^29.1.0",
"electron": "^29.1.4",
"electron-playwright-helpers": "^1.7.1",
"eslint": "^8.57.0",
"eslint-import-resolver-typescript": "^3.6.1",
@@ -67,12 +67,13 @@
"octokit": "^3.1.2",
"progress": "^2.0.3",
"tailwind-merge": "^2.2.1",
"tailwind-scrollbar": "^3.1.0",
"tailwindcss": "^3.4.1",
"tailwindcss-animate": "^1.0.7",
"ts-node": "^10.9.2",
"tslib": "^2.6.2",
"typescript": "^5.3.3",
"vite": "^5.1.5",
"typescript": "^5.4.2",
"vite": "^5.1.6",
"vite-plugin-static-copy": "^1.0.1",
"zx": "^7.2.3"
},
@@ -81,7 +82,7 @@
"@ffmpeg/ffmpeg": "^0.12.10",
"@ffmpeg/util": "^0.12.1",
"@hookform/resolvers": "^3.3.4",
"@langchain/community": "^0.0.34",
"@langchain/community": "^0.0.39",
"@langchain/google-genai": "^0.0.10",
"@mozilla/readability": "^0.5.0",
"@radix-ui/react-accordion": "^1.1.2",
@@ -112,6 +113,7 @@
"axios": "^1.6.7",
"camelcase": "^8.0.0",
"camelcase-keys": "^9.1.3",
"chart.js": "^4.4.2",
"cheerio": "^1.0.0-rc.12",
"class-variance-authority": "^0.7.0",
"clsx": "^2.1.0",
@@ -122,7 +124,8 @@
"dayjs": "^1.11.10",
"decamelize": "^6.0.0",
"decamelize-keys": "^2.0.1",
"electron-log": "^5.1.1",
"echogarden": "https://github.com/an-lee/echogarden",
"electron-log": "^5.1.2",
"electron-settings": "^4.0.2",
"electron-squirrel-startup": "^1.0.0",
"ffmpeg-static": "^5.2.0",
@@ -130,27 +133,27 @@
"fs-extra": "^11.2.0",
"html-to-text": "^9.0.5",
"https-proxy-agent": "^7.0.4",
"i18next": "^23.10.0",
"intl-tel-input": "^19.5.5",
"i18next": "^23.10.1",
"intl-tel-input": "^19.5.7",
"js-md5": "^0.8.3",
"langchain": "^0.1.25",
"langchain": "^0.1.28",
"lodash": "^4.17.21",
"lucide-react": "^0.344.0",
"lucide-react": "^0.358.0",
"mark.js": "^8.11.1",
"microsoft-cognitiveservices-speech-sdk": "^1.35.0",
"next-themes": "^0.2.1",
"openai": "^4.28.4",
"microsoft-cognitiveservices-speech-sdk": "^1.36.0",
"next-themes": "^0.3.0",
"openai": "^4.29.0",
"pitchfinder": "^2.3.2",
"postcss": "^8.4.35",
"proxy-agent": "^6.4.0",
"react": "^18.2.0",
"react-activity-calendar": "^2.2.7",
"react-activity-calendar": "^2.2.8",
"react-dom": "^18.2.0",
"react-hook-form": "^7.51.0",
"react-hotkeys-hook": "^4.5.0",
"react-i18next": "^14.0.5",
"react-i18next": "^14.1.0",
"react-markdown": "^9.0.1",
"react-router-dom": "^6.22.2",
"react-router-dom": "^6.22.3",
"react-tooltip": "^5.26.3",
"reflect-metadata": "^0.2.1",
"rimraf": "^5.0.5",
@@ -160,7 +163,7 @@
"sqlite3": "^5.1.7",
"tailwind-scrollbar-hide": "^1.1.7",
"umzug": "^3.7.0",
"wavesurfer.js": "^7.7.3",
"wavesurfer.js": "^7.7.5",
"zod": "^3.22.4"
}
}


@@ -18,7 +18,7 @@ export default defineConfig({
/* Retry on CI only */
retries: process.env.CI ? 2 : 0,
/* Opt out of parallel tests on CI. */
workers: process.env.CI ? 1 : undefined,
workers: 1,
/* Reporter to use. See https://playwright.dev/docs/test-reporters */
reporter: "html",
/* Shared settings for all the projects below. See https://playwright.dev/docs/api/class-testoptions. */


@@ -7,6 +7,24 @@ export const WEB_API_URL = "https://enjoy-web.fly.dev";
export const REPO_URL = "https://github.com/xiaolai/everyone-can-use-english";
export const MAGIC_TOKEN_REGEX =
/\b(Mrs|Ms|Mr|Dr|Prof|St|[a-zA-Z]{1,2}|\d{1,2})\.\b/g;
export const END_OF_SENTENCE_REGEX = /[^\.!,\?][\.!\?]/g;
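// Trim silence quieter than -50 dB from both ends: strip leading silence, reverse, strip leading silence again, reverse back.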
export const FFMPEG_TRIM_SILENCE_OPTIONS = [
"-af",
"silenceremove=1:start_duration=1:start_threshold=-50dB:detection=peak,aformat=dblp,areverse,silenceremove=start_periods=1:start_duration=1:start_threshold=-50dB:detection=peak,aformat=dblp,areverse",
];
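// Convert audio to 16 kHz mono 16-bit PCM WAV (the input format whisper.cpp expects).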
export const FFMPEG_CONVERT_WAV_OPTIONS = [
"-ar",
"16000",
"-ac",
"1",
"-c:a",
"pcm_s16le",
];
// https://huggingface.co/ggerganov/whisper.cpp/tree/main
export const WHISPER_MODELS_OPTIONS = [
{
@@ -344,3 +362,133 @@ export const CONVERSATION_PRESETS = [
},
},
];
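// Maps raw IPA phones (such as those produced by forced alignment) to a simplified set of common English phoneme symbols for display.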
export const IPA_MAPPING = {
p: "p",
b: "b",
t: "t",
d: "d",
ʈ: "t",
ɖ: "d",
c: "k",
ɟ: "g",
k: "k",
g: "g",
q: "k",
ɢ: "g",
ʔ: "",
ɡ: "g",
m: "m",
ɱ: "m",
n: "n",
ɳ: "n",
ɲ: "j",
ŋ: "ŋ",
ɴ: "ŋ",
: "n",
ʙ: "r",
r: "r",
ʀ: "r",
: "",
ɾ: "r",
ɽ: "r",
ɸ: "f",
β: "v",
f: "f",
v: "v",
θ: "θ",
ð: "ð",
s: "s",
z: "z",
ʃ: "ʃ",
ʒ: "ʒ",
ʂ: "s",
ʐ: "z",
ç: "",
ʝ: "j",
x: "h",
ɣ: "g",
χ: "h",
ʁ: "r",
ħ: "h",
ʕ: "",
h: "h",
ɦ: "h",
ɬ: "",
ɮ: "",
: "tʃ",
ʈʃ: "tʃ",
: "dʒ",
ʋ: "v",
ɹ: "r",
ɻ: "r",
j: "j",
ɰ: "w",
w: "w",
l: "l",
ɭ: "l",
ʎ: "j",
ʟ: "l",
i: "iː",
: "iː",
ʉɯ: "uː",
u: "uː",
iː: "iː",
ɪ: "ɪ",
ʏ: "ɪ",
ʊ: "ʊ",
ɨ: "ɪ",
: "ɪ",
e: "e",
ø: "e",
ɘ: "ə",
ɵ: "ə",
ɤ: "ɒ",
o: "ɔː",
ə: "ə",
oː: "ɔː",
ɛ: "æ",
œ: "æ",
ɜ: "əː",
ɞ: "əː",
ʌ: "ʌ",
ɔ: "ɔː",
ɜː: "əː",
uː: "uː",
ɔː: "ɔː",
ɛː: "æ",
æ: "æ",
a: "ɑː",
ɶ: "ɑː",
ɐ: "ɑː",
ɑ: "ɑː",
ɒ: "ɒ",
ɑː: "ɑː",
"◌˞": "",
ɚ: "ɪə",
ɝ: "ɪə",
ɹ̩: "r",
eɪ: "eɪ",
əʊ: "əʊ",
: "əʊ",
aɪ: "aɪ",
ɔɪ: "ɔɪ",
: "aʊ",
: "ɪə",
ɜr: "ɪə(r)",
ɑr: "ɑː(r)",
ɔr: "ɔː(r)",
oʊr: "əʊ(r)",
oːɹ: "ɔː(r)",
ir: "iː(r)",
ɪɹ: "ɪ(r)",
ɔːɹ: "ɔː(r)",
ɑːɹ: "ɑː(r)",
ʊɹ: "ʊ(r)",
ʊr: "ʊ(r)",
ɛr: "æ(r)",
ɛɹ: "æ(r)",
əl: "ə",
aɪɚ: "aɪ",
aɪə: "aɪ",
};


@@ -151,6 +151,7 @@
"yesterday": "yesterday",
"play": "play",
"pause": "pause",
"switchPlayMode": "switch play mode",
"playSingleSegment": "play single segment",
"playAllSegments": "play all segments",
"playInLoop": "play in loop",
@@ -241,9 +242,13 @@
"logoutAndRemoveAllPersonalData": "Logout and remove all personal data",
"logoutAndRemoveAllPersonalSettings": "Logout and remove all personal settings",
"hotkeys": "Hotkeys",
"system": "System",
"player": "Player",
"quitApp": "Quit APP",
"openPreferences": "Open preferences",
"playOrPause": "Play or pause",
"playOrPauseRecording": "Play or pause recording",
"startOrStopRecording": "start or stop recording",
"about": "About",
"currentVersion": "Current version",
"checkUpdate": "Check update",
@@ -268,8 +273,7 @@
"editResource": "edit resource",
"deleteResource": "delete resource",
"deleteResourceConfirmation": "Are you sure to delete {{name}}?",
"transcribeAudioConfirmation": "It will remove the old transcription. Are you sure to transcribe {{name}}",
"transcribeVideoConfirmation": "It will remove the old transcription. Are you sure to transcribe {{name}}",
"transcribeMediaConfirmation": "It will remove the old transcription. Are you sure to transcribe {{name}}",
"localFile": "local file",
"resourcesYouAddedRecently": "resources you added recently",
"recentlyAdded": "recently added",
@@ -291,6 +295,7 @@
"deleteRecording": "delete recording",
"deleteRecordingConfirmation": "Are you sure to delete this recording?",
"myRecordings": "my recordings",
"noRecordingForThisSegmentYet": "No recordings for this segment yet. Press <kbd>R</kbd> to start recording.",
"lastYear": "last year",
"less": "less",
"more": "more",
@@ -474,7 +479,19 @@
"itMayTakeAWhileToPrepareForTheFirstLoad": "It may take a while to prepare for the first load. Please be patient.",
"loadingTranscription": "Loading transcription",
"cannotFindMicrophone": "Cannot find microphone",
"savingRecording": "Saving recording",
"recordingSaved": "Recording saved",
"failedToSaveRecording": "Failed to save recording",
"speechNotCreatedYet": "Speech not created yet",
"goToConversation": "Go to conversation"
"goToConversation": "Go to conversation",
"mediaInfo": "Media Info",
"editRegion": "edit region",
"dragRegionBorderToEdit": "Drag region border to edit",
"startRecording": "start recording",
"stopRecording": "stop recording",
"playRecording": "play recording",
"clickAnyWordToSelect": "Click any words to select. Press shift to select multiple words.",
"currentRegionIsBeingEdited": "Current region is being edited",
"compare": "compare",
"selectRegion": "select region"
}


@@ -151,6 +151,7 @@
"yesterday": "昨天",
"play": "播放",
"pause": "暂停",
"switchPlayMode": "切换播放模式",
"playSingleSegment": "播放单句",
"playAllSegments": "播放所有",
"playInLoop": "单句循环",
@@ -241,9 +242,13 @@
"logoutAndRemoveAllPersonalData": "退出登录并删除所有个人数据",
"logoutAndRemoveAllPersonalSettings": "退出登录并删除所有个人设置选项",
"hotkeys": "快捷键",
"system": "系统",
"player": "播放器",
"quitApp": "退出应用",
"openPreferences": "打开设置",
"playOrPause": "播放/暂停",
"playOrPauseRecording": "播放/暂停录音",
"startOrStopRecording": "开始/结束录音",
"about": "关于",
"currentVersion": "当前版本",
"checkUpdate": "检查更新",
@@ -269,7 +274,6 @@
"deleteResource": "删除资源",
"deleteResourceConfirmation": "您确定要删除资源 {{name}} 吗?",
"transcribeAudioConfirmation": "这将删除原来的语音文本,您确定要重新对 {{name}} 进行语音转文本吗?",
"transcribeVideoConfirmation": "这将删除原来的语音文本,您确定要重新对 {{name}} 进行语音转文本吗?",
"localFile": "本地文件",
"recentlyAdded": "最近添加",
"resourcesYouAddedRecently": "最近添加的资源",
@@ -291,6 +295,7 @@
"deleteRecording": "删除录音",
"deleteRecordingConfirmation": "您确定要删除录音吗?",
"myRecordings": "我的练习",
"noRecordingForThisSegmentYet": "当前句子还没有练习过。按 <kbd>R</kbd> 键开始录音。",
"lastYear": "过去一年",
"less": "更少",
"more": "更多",
@@ -473,7 +478,19 @@
"itMayTakeAWhileToPrepareForTheFirstLoad": "首次加载可能需要一些时间,请耐心等候",
"loadingTranscription": "正在加载语音文本",
"cannotFindMicrophone": "无法找到麦克风",
"savingRecording": "正在保存录音",
"recordingSaved": "录音已保存",
"failedToSaveRecording": "保存录音失败",
"speechNotCreatedYet": "尚未生成语音",
"goToConversation": "前往对话"
"goToConversation": "前往对话",
"mediaInfo": "资源信息",
"editRegion": "修改当前区域",
"dragRegionBorderToEdit": "拖动区域边界以修改",
"startRecording": "开始录音",
"stopRecording": "结束录音",
"playRecording": "播放录音",
"clickAnyWordToSelect": "点击任意单词可以选中,同时按下 Shift 键可以多选",
"currentRegionIsBeingEdited": "当前区域正在编辑中",
"compare": "对比",
"selectRegion": "选取区域"
}


@@ -81,6 +81,12 @@
}
}
@layer components {
.scroll {
@apply scrollbar-thin scrollbar-thumb-primary scrollbar-track-secondary;
}
}
body {
user-select: none;
}


@@ -129,6 +129,11 @@ export class Audio extends Model<Audio> {
return this.getDataValue("metadata").duration;
}
@Column(DataType.VIRTUAL)
get mediaType(): string {
return "Audio";
}
get extname(): string {
return (
this.getDataValue("metadata").extname ||


@@ -25,6 +25,7 @@ import storage from "@main/storage";
import { Client } from "@/api";
import { WEB_API_URL } from "@/constants";
import { AzureSpeechSdk } from "@main/azure-speech-sdk";
import Ffmpeg from "@main/ffmpeg";
import camelcaseKeys from "camelcase-keys";
const logger = log.scope("db/models/recording");
@@ -299,10 +300,18 @@ export class Recording extends Model<Recording> {
referenceText?: string;
}
) {
const { targetId, targetType, referenceId, referenceText, duration } =
params;
const { targetId, targetType, referenceId, referenceText } = params;
let { duration } = params;
if (blob.arrayBuffer.byteLength === 0) {
throw new Error("Empty recording");
}
const format = blob.type.split("/")[1]?.split(";")?.[0];
if (!format) {
throw new Error("Unknown recording format");
}
const format = blob.type.split("/")[1];
const file = path.join(
settings.userDataPath(),
"recordings",
@@ -310,6 +319,18 @@ export class Recording extends Model<Recording> {
);
await fs.outputFile(file, Buffer.from(blob.arrayBuffer));
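// Recompute the duration from the saved file's metadata; presumably the duration reported by the recorder is not always reliable.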
try {
const ffmpeg = new Ffmpeg();
const metadata = await ffmpeg.generateMetadata(file);
duration = Math.floor(metadata.format.duration * 1000);
} catch (err) {
logger.error(err);
}
if (duration === 0) {
throw new Error("Failed to get duration of the recording");
}
const md5 = await hashFile(file, { algo: "md5" });
const filename = `${md5}.${format}`;
fs.renameSync(file, path.join(path.dirname(file), filename));


@@ -129,6 +129,11 @@ export class Video extends Model<Video> {
return this.getDataValue("metadata").duration;
}
@Column(DataType.VIRTUAL)
get mediaType(): string {
return "Video";
}
get extname(): string {
return (
this.getDataValue("metadata").extname ||


@@ -0,0 +1,68 @@
import { ipcMain } from "electron";
import { align } from "echogarden/dist/api/API.js";
import { AlignmentOptions } from "echogarden/dist/api/API";
import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
import path from "path";
import log from "@main/logger";
import url from "url";
import settings from "@main/settings";
import fs from "fs-extra";
const __filename = url.fileURLToPath(import.meta.url);
/*
* sample files will be in /app.asar.unpacked instead of /app.asar
*/
const __dirname = path
.dirname(__filename)
.replace("app.asar", "app.asar.unpacked");
const logger = log.scope("echogarden");
class EchogardenWrapper {
public align: typeof align;
constructor() {
this.align = align;
}
async check() {
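// Sanity-check echogarden by force-aligning a bundled sample clip, caching the resulting timeline for inspection.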
const sampleFile = path.join(__dirname, "samples", "jfk.wav");
try {
const result = await this.align(
sampleFile,
"And so my fellow Americans ask not what your country can do for you",
{}
);
logger.info(result);
fs.writeJsonSync(
path.join(settings.cachePath(), "echogarden-check.json"),
result,
{ spaces: 2 }
);
return true;
} catch (e) {
logger.error(e);
return false;
}
}
registerIpcHandlers() {
ipcMain.handle(
"echogarden-align",
async (
_event,
input: AudioSourceParam,
transcript: string,
options: AlignmentOptions
) => {
return this.align(input, transcript, options);
}
);
ipcMain.handle("echogarden-check", async (_event) => {
return this.check();
});
}
}
export default new EchogardenWrapper();
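
For reference, a minimal sketch of calling this wrapper from the renderer once the preload bridge below is in place (the option value and the exact shape of the returned timeline are assumptions based on echogarden's API):

// renderer side (hypothetical usage)
const alignment = await window.__ENJOY_APP__.echogarden.align(
  audioFilePath, // an AudioSourceParam; a file path works
  transcriptText, // the transcript to align against
  { language: "en" } // AlignmentOptions (assumed value)
);
// alignment.timeline nests sentence → word → token → phone entries,
// which MediaCaption below reads as caption.timeline[i].timeline[...].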


@@ -7,6 +7,7 @@ import path from "path";
import fs from "fs-extra";
import settings from "./settings";
import url from "url";
import { FFMPEG_CONVERT_WAV_OPTIONS } from "@/constants";
/*
* ffmpeg and ffprobe bin file will be in /app.asar.unpacked instead of /app.asar
@@ -19,6 +20,8 @@ const __dirname = path
.dirname(__filename)
.replace("app.asar", "app.asar.unpacked");
process.env.FFMPEG_PATH = ffmpegPath;
const logger = log.scope("ffmpeg");
export default class FfmpegWrapper {
checkCommand(): Promise<boolean> {
@@ -211,7 +214,7 @@ export default class FfmpegWrapper {
);
}
options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
options = options || FFMPEG_CONVERT_WAV_OPTIONS;
const ffmpeg = Ffmpeg();
return new Promise((resolve, reject) => {


@@ -21,6 +21,7 @@ import { AudibleProvider, TedProvider } from "@main/providers";
import Ffmpeg from "@main/ffmpeg";
import { Waveform } from "./waveform";
import url from "url";
import echogarden from "./echogarden";
const __filename = url.fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
@@ -49,6 +50,9 @@ main.init = () => {
// Prepare Settings
settings.registerIpcHandlers();
// echogarden
echogarden.registerIpcHandlers();
// Whisper
whisper.registerIpcHandlers();
@@ -433,10 +437,11 @@ ${log}
// Create the browser window.
const mainWindow = new BrowserWindow({
icon: "./assets/icon.png",
width: 1600,
height: 1200,
minWidth: 1024,
minHeight: 768,
width: 1920,
height: 1080,
minWidth: 1440,
minHeight: 900,
fullscreen: true,
webPreferences: {
preload: path.join(__dirname, "preload.js"),
},


@@ -350,6 +350,14 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
return ipcRenderer.invoke("audiowaveform-frequencies", file);
},
},
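// Bridge the echogarden align/check IPC handlers to the renderer.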
echogarden: {
align: (input: string, transcript: string, options: any) => {
return ipcRenderer.invoke("echogarden-align", input, transcript, options);
},
check: () => {
return ipcRenderer.invoke("echogarden-check");
},
},
whisper: {
config: () => {
return ipcRenderer.invoke("whisper-config");


@@ -1,394 +0,0 @@
import { useEffect, useState, useContext } from "react";
import {
DbProviderContext,
AppSettingsProviderContext,
AISettingsProviderContext,
} from "@renderer/context";
import {
LoaderSpin,
RecordingsList,
PagePlaceholder,
MediaPlayer,
MediaTranscription,
} from "@renderer/components";
import { CheckCircleIcon, LoaderIcon } from "lucide-react";
import {
AlertDialog,
AlertDialogHeader,
AlertDialogDescription,
AlertDialogTitle,
AlertDialogContent,
AlertDialogFooter,
AlertDialogCancel,
Button,
PingPoint,
Progress,
ScrollArea,
toast,
} from "@renderer/components/ui";
import { t } from "i18next";
import { useTranscribe } from "@renderer/hooks";
import { useNavigate } from "react-router-dom";
export const AudioDetail = (props: { id?: string; md5?: string }) => {
const navigate = useNavigate();
const { id, md5 } = props;
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const { whisperConfig } = useContext(AISettingsProviderContext);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const [audio, setAudio] = useState<AudioType | null>(null);
const [transcription, setTranscription] = useState<TranscriptionType>(null);
const [sharing, setSharing] = useState<boolean>(false);
// Transcription controls
const [transcribing, setTranscribing] = useState<boolean>(false);
const { transcribe } = useTranscribe();
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
// Player controls
const [initialized, setInitialized] = useState<boolean>(false);
const [currentTime, setCurrentTime] = useState<number>(0);
const [seek, setSeek] = useState<{
seekTo: number;
timestamp: number;
}>();
const [currentSegmentIndex, setCurrentSegmentIndex] = useState<number>(0);
const [zoomRatio, setZoomRatio] = useState<number>(1.0);
const [isPlaying, setIsPlaying] = useState(false);
const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("all");
const [playBackRate, setPlaybackRate] = useState<number>(1);
const [displayInlineCaption, setDisplayInlineCaption] =
useState<boolean>(true);
const onTransactionUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (model === "Transcription" && action === "update") {
setTranscription(record);
}
};
const findOrCreateTranscription = async () => {
if (!audio) return;
if (transcription) return;
return EnjoyApp.transcriptions
.findOrCreate({
targetId: audio.id,
targetType: "Audio",
})
.then((transcription) => {
setTranscription(transcription);
})
.catch((err) => {
toast.error(err.message);
});
};
const generateTranscription = async () => {
if (transcribing) return;
if (!transcription) {
await findOrCreateTranscription();
}
setTranscribing(true);
setTranscribingProgress(0);
try {
const { engine, model, result } = await transcribe(audio.src, {
targetId: audio.id,
targetType: "Audio",
});
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result,
engine,
model,
});
} catch (err) {
toast.error(err.message);
}
setTranscribing(false);
};
const findTranscriptionFromWebApi = async () => {
if (!transcription) {
await findOrCreateTranscription();
}
const res = await webApi.transcriptions({
targetMd5: audio.md5,
});
const transcript = (res?.transcriptions || []).filter((t) =>
["base", "small", "medium", "large", "whisper-1"].includes(t.model)
)?.[0];
if (!transcript) {
throw new Error("Transcription not found");
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: transcript.result,
engine: transcript.engine,
model: transcript.model,
});
};
const findOrGenerateTranscription = async () => {
try {
await findTranscriptionFromWebApi();
} catch (err) {
console.error(err);
await generateTranscription();
}
};
const handleShare = async () => {
if (!audio.source && !audio.isUploaded) {
try {
await EnjoyApp.audios.upload(audio.id);
} catch (err) {
toast.error(t("shareFailed"), {
description: err.message,
});
return;
}
}
webApi
.createPost({
targetType: "Audio",
targetId: audio.id,
})
.then(() => {
toast.success(t("sharedSuccessfully"), {
description: t("sharedAudio"),
});
})
.catch((err) => {
toast.error(t("shareFailed"), {
description: err.message,
});
});
setSharing(false);
};
useEffect(() => {
const where = id ? { id } : { md5 };
EnjoyApp.audios.findOne(where).then((audio) => {
if (audio) {
setAudio(audio);
} else {
toast.error(t("models.audio.notFound"));
}
});
}, [id, md5]);
useEffect(() => {
if (!audio) return;
findOrCreateTranscription();
}, [audio]);
useEffect(() => {
if (!initialized) return;
if (!transcription) return;
addDblistener(onTransactionUpdate);
if (transcription?.state == "pending") {
findOrGenerateTranscription();
}
if (whisperConfig.service === "local") {
EnjoyApp.whisper.onProgress((_, p: number) => {
if (p > 100) p = 100;
setTranscribingProgress(p);
});
}
return () => {
removeDbListener(onTransactionUpdate);
EnjoyApp.whisper.removeProgressListeners();
};
}, [md5, transcription, initialized]);
if (!audio) {
return <LoaderSpin />;
}
if (!audio.src) {
return (
<PagePlaceholder placeholder="invalid" extra="cannot find play source" />
);
}
return (
<div className="relative" data-testid="audio-detail">
<div className={`grid grid-cols-7 gap-4 ${initialized ? "" : "blur-sm"}`}>
<div className="col-span-5 h-[calc(100vh-6.5rem)] flex flex-col">
<MediaPlayer
mediaId={audio.id}
mediaType="Audio"
mediaUrl={audio.src}
mediaMd5={audio.md5}
transcription={transcription}
currentTime={currentTime}
setCurrentTime={setCurrentTime}
currentSegmentIndex={currentSegmentIndex}
setCurrentSegmentIndex={setCurrentSegmentIndex}
recordButtonVisible={true}
seek={seek}
initialized={initialized}
setInitialized={setInitialized}
zoomRatio={zoomRatio}
setZoomRatio={setZoomRatio}
isPlaying={isPlaying}
setIsPlaying={setIsPlaying}
playMode={playMode}
setPlayMode={setPlayMode}
playBackRate={playBackRate}
setPlaybackRate={setPlaybackRate}
displayInlineCaption={displayInlineCaption}
setDisplayInlineCaption={setDisplayInlineCaption}
onShare={() => setSharing(true)}
onDecoded={({ duration, sampleRate }) => {
if (audio.duration) return;
EnjoyApp.audios.update(audio.id, {
metadata: Object.assign({}, audio.metadata, {
duration,
sampleRate,
}),
});
}}
/>
<ScrollArea className={`flex-1 relative bg-muted`}>
<RecordingsList
key={`recordings-list-${audio.id}-${currentSegmentIndex}`}
targetId={audio.id}
targetType="Audio"
referenceText={transcription?.result?.[currentSegmentIndex]?.text}
referenceId={currentSegmentIndex}
/>
</ScrollArea>
</div>
<div className="col-span-2 h-[calc(100vh-6.5rem)]">
<MediaTranscription
mediaId={audio.id}
mediaType="Audio"
mediaName={audio.name}
transcription={transcription}
transcribing={transcribing}
progress={transcribingProgress}
transcribe={generateTranscription}
currentSegmentIndex={currentSegmentIndex}
onSelectSegment={(index) => {
if (currentSegmentIndex === index) return;
const segment = transcription?.result?.[index];
if (!segment) return;
if (playMode === "loop" && isPlaying) setIsPlaying(false);
setSeek({
seekTo: segment.offsets.from / 1000,
timestamp: Date.now(),
});
}}
/>
</div>
</div>
<AlertDialog open={sharing} onOpenChange={(value) => setSharing(value)}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("shareAudio")}</AlertDialogTitle>
<AlertDialogDescription>
{t("areYouSureToShareThisAudioToCommunity")}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<Button variant="default" onClick={handleShare}>
{t("share")}
</Button>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
{/* Show loading progress until waveform is decoded & transcribed */}
<AlertDialog open={!initialized || !Boolean(transcription?.result)}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("preparingAudio")}</AlertDialogTitle>
<AlertDialogDescription>
{t("itMayTakeAWhileToPrepareForTheFirstLoad")}
</AlertDialogDescription>
</AlertDialogHeader>
<div className="py-4">
{initialized ? (
<div className="mb-4 flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("waveformIsDecoded")}</span>
</div>
) : (
<div className="mb-4 flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("decodingWaveform")}</span>
</div>
)}
{!transcription ? (
<div className="flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("loadingTranscription")}</span>
</div>
) : transcription.result ? (
<div className="flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("transcribedSuccessfully")}</span>
</div>
) : transcribing ? (
<div className="">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{whisperConfig.service === "local" && (
<Progress value={transcribingProgress} />
)}
</div>
) : (
<div className="flex items-center space-x-4">
<PingPoint colorClassName="bg-muted" />
<div className="inline">
<span>{t("notTranscribedYet")}</span>
{initialized && (
<Button
onClick={generateTranscription}
className="ml-4"
size="sm"
>
{t("transcribe")}
</Button>
)}
</div>
</div>
)}
</div>
<AlertDialogFooter>
<Button variant="secondary" onClick={() => navigate(-1)}>
{t("cancel")}
</Button>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</div>
);
};


@@ -0,0 +1,72 @@
import { useEffect, useContext, useRef } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import {
MediaLoadingModal,
MediaCaption,
MediaPlayerControls,
MediaTabs,
MediaCurrentRecording,
} from "@renderer/components";
import { formatDuration } from "@renderer/lib/utils";
import { useAudio } from "@renderer/hooks";
export const AudioPlayer = (props: { id?: string; md5?: string }) => {
const { id, md5 } = props;
const { media, currentTime, setMedia, setRef } = useContext(
MediaPlayerProviderContext
);
const { audio } = useAudio({ id, md5 });
const ref = useRef(null);
useEffect(() => {
if (!audio) return;
setMedia(audio);
}, [audio]);
useEffect(() => {
setRef(ref);
}, [ref]);
return (
<div data-testid="audio-player">
<div className="h-[calc(100vh-37.5rem)] mb-4">
<div className="grid grid-cols-3 gap-4 px-6 h-full">
<div className="col-span-1 rounded-lg border shadow-lg h-[calc(100vh-37.5rem)]">
<MediaTabs />
</div>
<div className="col-span-2 h-[calc(100vh-37.5rem)]">
<MediaCaption />
</div>
</div>
</div>
<div className="h-[33rem] flex flex-col">
<div className="h-[13rem] py-2 px-6 mb-4">
<MediaCurrentRecording />
</div>
<div className="w-full h-[13rem] px-6 py-2 mb-4">
<div className="border rounded-xl shadow-lg relative">
<div data-testid="media-player-container" ref={ref} />
<div className="absolute right-2 top-1">
<span className="text-sm">
{formatDuration(currentTime || 0)}
</span>
<span className="mx-1">/</span>
<span className="text-sm">
{formatDuration(media?.duration || 0)}
</span>
</div>
</div>
</div>
<div className="w-full bg-background z-10 shadow-xl">
<MediaPlayerControls />
</div>
</div>
<MediaLoadingModal />
</div>
);
};


@@ -1,8 +1,9 @@
export * from "./audios-table";
export * from "./audio-edit-form";
export * from "./audio-detail";
export * from "./audios-component";
export * from "./audible-books-segment";
export * from "./audios-segment";
export * from "./audio-card";
export * from "./audio-player";


@@ -1,5 +1,6 @@
import { useEffect, useState, useRef, useCallback } from "react";
import { PitchContour } from "@renderer/components";
import { renderPitchContour } from "@renderer/lib/utils";
import { extractFrequencies } from "@/utils";
import WaveSurfer from "wavesurfer.js";
import { Button, Skeleton } from "@renderer/components/ui";
import { PlayIcon, PauseIcon } from "lucide-react";
@@ -59,17 +60,25 @@ export const SpeechPlayer = (props: {
wavesurfer.on("pause", () => {
setIsPlaying(false);
}),
wavesurfer.on("decode", () => {
wavesurfer.on("ready", () => {
setDuration(wavesurfer.getDuration());
const peaks = wavesurfer.getDecodedData().getChannelData(0);
const sampleRate = wavesurfer.options.sampleRate;
wavesurfer.renderer.getWrapper().appendChild(
PitchContour({
peaks,
sampleRate,
height,
})
);
const data = extractFrequencies({ peaks, sampleRate });
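// Delay rendering so the waveform finishes drawing before the pitch contour canvas is overlaid (the 1s delay looks like a heuristic).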
setTimeout(() => {
renderPitchContour({
wrapper: wavesurfer.getWrapper(),
canvasId: `pitch-contour-${speech.id}-canvas`,
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
pointRadius: 1,
},
],
});
}, 1000);
setInitialized(true);
}),
];


@@ -25,7 +25,6 @@ export * from "./login-form";
export * from "./choose-library-path-input";
export * from "./whisper-model-options";
export * from "./pitch-contour";
export * from "./reset-all-button";
export * from "./loader-spin";


@@ -1,5 +1,11 @@
export * from "./add-media-button";
export * from "./media-player";
export * from "./media-player-controls";
export * from "./media-caption";
export * from "./media-info-panel";
export * from "./media-recordings";
export * from "./media-current-recording";
export * from "./media-recorder";
export * from "./media-transcription";
export * from "./media-player";
export * from "./media-tabs";
export * from "./media-loading-modal";
export * from "./add-media-button";


@@ -1,91 +1,75 @@
import { useState, useEffect } from "react";
import { cn } from "@renderer/lib/utils";
import { useEffect, useState, useContext } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import cloneDeep from "lodash/cloneDeep";
import {
Button,
DropdownMenu,
DropdownMenuContent,
DropdownMenuItem,
DropdownMenuTrigger,
Popover,
PopoverContent,
PopoverAnchor,
toast,
ScrollArea,
Separator,
} from "@renderer/components/ui";
import { LookupResult } from "@renderer/components";
import {
ChevronDownIcon,
LanguagesIcon,
PlayIcon,
LoaderIcon,
SpeechIcon,
} from "lucide-react";
import { t } from "i18next";
import { LanguagesIcon, SpeechIcon } from "lucide-react";
import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
import { IPA_MAPPING } from "@/constants";
import { useAiCommand } from "@renderer/hooks";
import { LoaderIcon } from "lucide-react";
export const MediaCaption = (props: {
mediaId: string;
mediaType: string;
currentTime: number;
transcription: TranscriptionResultSegmentGroupType;
onSeek?: (time: number) => void;
className?: string;
isPlaying: boolean;
setIsPlaying: (isPlaying: boolean) => void;
}) => {
export const MediaCaption = () => {
const {
transcription,
wavesurfer,
currentSegmentIndex,
currentTime,
onSeek,
className,
isPlaying,
setIsPlaying,
} = props;
transcription,
regions,
activeRegion,
setActiveRegion,
editingRegion,
setEditingRegion,
setTranscriptionDraft,
} = useContext(MediaPlayerProviderContext);
const [activeIndex, setActiveIndex] = useState<number>(0);
const [selected, setSelected] = useState<{
index: number;
word: string;
position?: {
top: number;
left: number;
};
}>();
const [selectedIndices, setSelectedIndices] = useState<number[]>([]);
const [multiSelecting, setMultiSelecting] = useState<boolean>(false);
const [displayIpa, setDisplayIpa] = useState<boolean>(true);
const [translation, setTranslation] = useState<string>();
const [translating, setTranslating] = useState<boolean>(false);
const [displayTranslation, setDisplayTranslation] = useState<boolean>(false);
const [ipa, setIpa] = useState<{ word?: string; ipa?: string }[]>([]);
const [ipaGenerating, setIpaGenerating] = useState<boolean>(false);
const [displayIpa, setDisplayIpa] = useState<boolean>(false);
const [lookingUp, setLookingUp] = useState<boolean>(false);
const [lookupResult, setLookupResult] = useState<LookupType>();
const { translate, pronounce } = useAiCommand();
const caption = (transcription?.result?.timeline as Timeline)?.[
currentSegmentIndex
];
const toggleIpa = async () => {
if (ipaGenerating) return;
const { translate, lookupWord } = useAiCommand();
if (ipa.length > 0) {
setDisplayIpa(!displayIpa);
return;
}
const lookup = () => {
if (selectedIndices.length === 0) return;
setIpaGenerating(true);
toast.promise(
pronounce(transcription.text)
.then((words) => {
if (words?.length > 0) {
setIpa(words);
setDisplayIpa(true);
}
})
.finally(() => {
setIpaGenerating(false);
}),
{
loading: t("generatingIpa"),
success: t("generatedIpaSuccessfully"),
error: (err) => t("generatingIpaFailed", { error: err.message }),
position: "bottom-right",
}
);
const word = selectedIndices
.map((index) => caption.timeline[index].text)
.join(" ");
setLookingUp(true);
lookupWord({
word,
context: caption.text,
sourceId: transcription.targetId,
sourceType: transcription.targetType,
})
.then((lookup) => {
if (lookup?.meaning) {
setLookupResult(lookup);
}
})
.catch((error) => {
toast.error(error.message);
})
.finally(() => {
setLookingUp(false);
});
};
const toggleTranslation = async () => {
@@ -97,7 +81,7 @@ export const MediaCaption = (props: {
}
toast.promise(
translate(transcription.text)
translate(caption.text)
.then((result) => {
if (result) {
setTranslation(result);
@@ -116,177 +100,370 @@ export const MediaCaption = (props: {
);
};
const toggleMultiSelect = (event: KeyboardEvent) => {
setMultiSelecting(event.shiftKey && event.type === "keydown");
};
const toggleRegion = (index: number) => {
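// Clicking a word toggles a word-level region: a plain click replaces the region, shift-click (multiSelecting) extends it, and clicking inside the current word region falls back to the segment region.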
if (!activeRegion) return;
if (editingRegion) {
toast.warning(t("currentRegionIsBeingEdited"));
return;
}
const word = caption.timeline[index];
if (!word) return;
const start = word.startTime;
const end = word.endTime;
const regionStart = activeRegion.start;
const regionEnd = activeRegion.end;
if (activeRegion.id.startsWith("word-region")) {
if (start >= regionStart && end <= regionEnd) {
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
} else if (multiSelecting) {
const region = regions.addRegion({
id: `word-region-${index}`,
start: Math.min(start, regionStart),
end: Math.max(end, regionEnd),
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
} else {
const region = regions.addRegion({
id: `word-region-${index}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
}
activeRegion.remove();
} else {
const region = regions.addRegion({
id: `word-region-${index}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: false,
});
setActiveRegion(region);
}
};
const markPhoneRegions = () => {
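// Toggle phone-level regions inside the selected words; clicking a phone region plays just that phone.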
const phoneRegions = regions
.getRegions()
.filter((r) => r.id.startsWith("phone-region"));
if (phoneRegions.length > 0) {
phoneRegions.forEach((r) => {
r.remove();
r.unAll();
});
return;
}
if (!activeRegion) return;
if (!activeRegion.id.startsWith("word-region")) return;
if (!selectedIndices) return;
selectedIndices.forEach((index) => {
const word = caption.timeline[index];
word.timeline.forEach((token) => {
token.timeline.forEach((phone) => {
const region = regions.addRegion({
id: `phone-region-${index}`,
start: phone.startTime,
end: phone.endTime,
color: "#efefefef",
drag: false,
resize: editingRegion,
});
region.on("click", () => {
region.play();
});
});
});
});
};
useEffect(() => {
if (!transcription) return;
const time = Math.round(currentTime * 1000);
const index = transcription.segments.findIndex(
(w) => time >= w.offsets.from && time < w.offsets.to
if (!caption) return;
const index = caption.timeline.findIndex(
(w) => currentTime >= w.startTime && currentTime < w.endTime
);
if (index !== activeIndex) {
setActiveIndex(index);
}
}, [currentTime, transcription]);
}, [currentTime, caption]);
if (!transcription) return null;
if (Math.round(currentTime * 1000) < transcription.offsets.from) return null;
useEffect(() => {
if (!caption?.timeline) return;
if (!activeRegion) return;
if (!activeRegion.id.startsWith("word-region")) {
setSelectedIndices([]);
return;
}
const indices: number[] = [];
caption.timeline.forEach((w, index) => {
if (
w.startTime >= activeRegion.start &&
(w.endTime <= activeRegion.end ||
// The last word's end time may somehow be slightly greater than the audio duration.
w.endTime > wavesurfer.getDuration())
) {
indices.push(index);
}
});
if (indices.length > 0) {
const el = document.getElementById(
`word-${currentSegmentIndex}-${indices[0]}`
);
}
setSelectedIndices(indices);
setLookupResult(undefined);
}, [caption, activeRegion]);
useEffect(() => {
if (!activeRegion) return;
if (!activeRegion.id.startsWith("word-region")) return;
const region = regions.addRegion({
id: `word-region-${selectedIndices.join("-")}`,
start: activeRegion.start,
end: activeRegion.end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
activeRegion.remove();
setActiveRegion(region);
const subscriptions = [
regions.on("region-updated", (region) => {
if (!region.id.startsWith("word-region")) return;
const draft = cloneDeep(transcription.result);
const draftCaption = draft.timeline[currentSegmentIndex];
const firstIndex = selectedIndices[0];
const lastIndex = selectedIndices[selectedIndices.length - 1];
const firstWord = draftCaption.timeline[firstIndex];
const lastWord = draftCaption.timeline[lastIndex];
// If no word is selected somehow, then ignore the update.
if (!firstWord || !lastWord) {
setEditingRegion(false);
return;
}
firstWord.startTime = region.start;
lastWord.endTime = region.end;
/* Update the timestamps of the previous and next words.
* This happens only when the resized region overlaps the previous or next word;
* words the region does not touch keep their original timestamps.
*/
const prevWord = draftCaption.timeline[firstIndex - 1];
const nextWord = draftCaption.timeline[lastIndex + 1];
if (
prevWord &&
prevWord.endTime > region.start &&
prevWord.startTime < region.start
) {
prevWord.endTime = region.start;
}
if (
nextWord &&
nextWord.startTime < region.end &&
nextWord.endTime > region.end
) {
nextWord.startTime = region.end;
}
/*
* If the last word is the last word of the segment, then update the segment's end time.
*/
if (lastIndex === draftCaption.timeline.length - 1) {
draftCaption.endTime = region.end;
}
setTranscriptionDraft(draft);
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [editingRegion]);
useEffect(() => {
setTranslation(undefined);
setDisplayTranslation(false);
}, [caption]);
useEffect(() => {
document.addEventListener("keydown", (event: KeyboardEvent) =>
toggleMultiSelect(event)
);
document.addEventListener("keyup", (event: KeyboardEvent) =>
toggleMultiSelect(event)
);
return () => {
document.removeEventListener("keydown", toggleMultiSelect);
document.removeEventListener("keyup", toggleMultiSelect);
};
}, []);
if (!caption) return null;
return (
<div className={cn("relative px-4 py-2 text-lg", className)}>
<div className="flex items-start space-x-4">
<div className="flex-1">
<div className="flex flex-wrap">
{(transcription.segments || []).map((w, index) => (
<div
key={index}
className={`mr-1 cursor-pointer hover:bg-red-500/10 ${
index === activeIndex ? "text-red-500" : ""
}`}
onClick={(event) => {
setSelected({
index,
word: w.text,
position: {
top:
event.currentTarget.offsetTop +
event.currentTarget.offsetHeight,
left: event.currentTarget.offsetLeft,
},
});
setIsPlaying(false);
if (onSeek) onSeek(w.offsets.from / 1000);
}}
>
<div>{w.text}</div>
{displayIpa &&
ipa.find(
(i) =>
i.word.trim() === w.text.replace(/[\.",?!]/g, "").trim()
)?.ipa && (
<div className="text-sm text-foreground/70 font-serif">
{
ipa.find(
(i) =>
i.word.trim() ===
w.text.replace(/[\.",?!]/g, "").trim()
)?.ipa
}
</div>
)}
</div>
))}
</div>
{displayTranslation && translation && (
<div className="select-text py-2 text-sm text-foreground/70">
{translation}
</div>
)}
<div className="h-full flex justify-between space-x-4">
<ScrollArea className="flex-1 px-6 py-4 font-serif h-full border shadow-lg rounded-lg">
<div className="flex flex-wrap mb-4">
{/* Use the words split from the caption text when their count matches the timeline length; otherwise fall back to the timeline. */}
{caption.text.split(" ").length === caption.timeline.length
? caption.text.split(" ").map((word, index) => (
<div
key={index}
id={`word-${currentSegmentIndex}-${index}`}
className={`pr-2 pb-2 cursor-pointer hover:bg-red-500/10 ${
index === activeIndex ? "text-red-500" : ""
} ${selectedIndices.includes(index) ? "bg-red-500/10" : ""}`}
onClick={() => toggleRegion(index)}
>
<div className="">
<div className="text-2xl">{word}</div>
{displayIpa && (
<div className="text-muted-foreground">
{caption.timeline[index].timeline
.map((t) => t.timeline.map((s) => s.text).join(""))
.join(" · ")}
</div>
)}
</div>
</div>
))
: (caption.timeline || []).map((w, index) => (
<div
key={index}
id={`word-${currentSegmentIndex}-${index}`}
className={`pr-2 pb-2 cursor-pointer hover:bg-red-500/10 ${
index === activeIndex ? "text-red-500" : ""
} ${
selectedIndices.includes(index)
? "bg-red-500/10 selected"
: ""
}`}
onClick={() => toggleRegion(index)}
>
<div className="">
<div className="text-2xl">{w.text}</div>
{displayIpa && (
<div className="text-muted-foreground">
{w.timeline
.map((t) => t.timeline.map((s) => s.text).join(""))
.join(" · ")}
</div>
)}
</div>
</div>
))}
</div>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button variant="ghost" size="icon">
<ChevronDownIcon className="w-4 h-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={translating}
onClick={toggleTranslation}
>
{translating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (
<LanguagesIcon className="w-4 h-4 mr-2" />
)}
<span>{t("translate")}</span>
</DropdownMenuItem>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={ipaGenerating}
onClick={toggleIpa}
>
{ipaGenerating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (
<SpeechIcon className="w-4 h-4 mr-2" />
)}
<span>{t("displayIpa")}</span>
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
{displayTranslation && translation && (
<>
<Separator className="my-2" />
<div className="text-sm font-semibold py-2">{t("translation")}</div>
<div className="select-text py-2 text-sm text-foreground">
{translation}
</div>
</>
)}
<Popover
open={Boolean(selected) && !isPlaying}
onOpenChange={(value) => {
if (!value) setSelected(null);
}}
>
<PopoverAnchor
className="absolute w-0 h-0"
style={{
top: selected?.position?.top,
left: selected?.position?.left,
}}
></PopoverAnchor>
<PopoverContent
className="w-full max-w-md p-0"
updatePositionStrategy="always"
{selectedIndices.length > 0 && (
<>
<Separator className="my-2" />
<div className="flex flex-wrap items-center space-x-2 select-text mb-4">
{selectedIndices.map((index) => {
const word = caption.timeline[index];
if (!word) return;
return (
<div key={index}>
<div className="font-serif text-lg font-semibold tracking-tight">
{word.text}
</div>
<div className="text-sm text-serif text-muted-foreground">
{word.timeline
.map((t) => t.timeline.map((s) => s.text).join(""))
.join(" · ")}
</div>
</div>
);
})}
</div>
{lookupResult ? (
<div className="py-2 select-text">
<div className="text-serif">
{lookupResult.meaning.translation}
</div>
<div className="text-serif">
{lookupResult.meaning.definition}
</div>
</div>
) : (
<div className="flex items-center py-2">
<Button size="sm" disabled={lookingUp} onClick={lookup}>
{lookingUp && (
<LoaderIcon className="animate-spin w-4 h-4 mr-2" />
)}
<span>{t("translate")}</span>
</Button>
</div>
)}
</>
)}
</ScrollArea>
<div className="flex flex-col space-y-2">
<Button
variant={displayTranslation ? "secondary" : "outline"}
size="icon"
className="rounded-full w-8 h-8 p-0"
disabled={translating}
onClick={toggleTranslation}
>
{selected?.word && (
<ResourceCaptionSelectionMenu
word={selected.word}
context={transcription.segments
.map((w) => w.text)
.join(" ")
.trim()}
mediaId={props.mediaId}
mediaType={props.mediaType}
onPlay={() => {
setIsPlaying(true);
}}
/>
)}
</PopoverContent>
</Popover>
</div>
);
};
const ResourceCaptionSelectionMenu = (props: {
word: string;
context: string;
mediaId: string;
mediaType: string;
onPlay: () => void;
}) => {
const { word, context, mediaId, mediaType, onPlay } = props;
const [translating, setTranslating] = useState<boolean>(false);
if (!word) return null;
if (translating) {
return (
<LookupResult
word={word}
context={context}
sourceId={mediaId}
sourceType={mediaType}
/>
);
}
return (
<div className="flex items-center p-1">
<Button onClick={onPlay} variant="ghost" size="icon">
<PlayIcon size={16} />
</Button>
<Button onClick={() => setTranslating(true)} variant="ghost" size="icon">
<LanguagesIcon size={16} />
</Button>
<LanguagesIcon className="w-4 h-4" />
</Button>
<Button
variant={displayIpa ? "secondary" : "outline"}
size="icon"
className="rounded-full w-8 h-8 p-0"
onClick={() => setDisplayIpa(!displayIpa)}
>
<SpeechIcon className="w-4 h-4" />
</Button>
</div>
</div>
);
};


@@ -0,0 +1,511 @@
import { useEffect, useContext, useRef, useState } from "react";
import {
AppSettingsProviderContext,
MediaPlayerProviderContext,
} from "@renderer/context";
import { MediaRecorder, RecordingDetail } from "@renderer/components";
import { renderPitchContour } from "@renderer/lib/utils";
import { extractFrequencies } from "@/utils";
import WaveSurfer from "wavesurfer.js";
import Regions from "wavesurfer.js/dist/plugins/regions";
import {
AlertDialog,
AlertDialogContent,
AlertDialogDescription,
AlertDialogFooter,
AlertDialogHeader,
AlertDialogTitle,
AlertDialogCancel,
AlertDialogAction,
Button,
DropdownMenu,
DropdownMenuItem,
DropdownMenuTrigger,
DropdownMenuContent,
toast,
Sheet,
SheetContent,
SheetHeader,
SheetClose,
} from "@renderer/components/ui";
import {
GitCompareIcon,
PauseIcon,
PlayIcon,
Share2Icon,
GaugeCircleIcon,
ChevronDownIcon,
MoreVerticalIcon,
TextCursorInputIcon,
} from "lucide-react";
import { t } from "i18next";
import { formatDuration } from "@renderer/lib/utils";
import { useHotkeys } from "react-hotkeys-hook";
export const MediaCurrentRecording = (props: { height?: number }) => {
const { height = 192 } = props;
const {
isRecording,
currentRecording,
renderPitchContour: renderMediaPitchContour,
regions: mediaRegions,
activeRegion: mediaActiveRegion,
wavesurfer,
zoomRatio,
editingRegion,
currentTime: mediaCurrentTime,
} = useContext(MediaPlayerProviderContext);
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
const [player, setPlayer] = useState(null);
const [regions, setRegions] = useState<Regions | null>(null);
const [currentTime, setCurrentTime] = useState(0);
const [detailIsOpen, setDetailIsOpen] = useState(false);
const [isComparing, setIsComparing] = useState(false);
const [isSharing, setIsSharing] = useState(false);
const [isSelectingRegion, setIsSelectingRegion] = useState(false);
const [frequencies, setFrequencies] = useState<number[]>([]);
const [peaks, setPeaks] = useState<number[]>([]);
const ref = useRef(null);
const removeComparingPitchContour = () => {
if (!wavesurfer) return;
regions
.getRegions()
.find((r) => r.id.startsWith("recording-voice-region"))
?.remove();
const wrapper = (wavesurfer as any).renderer.getWrapper();
wrapper
.querySelectorAll(".pitch-contour-recording")
.forEach((el: HTMLDivElement) => el.remove());
};
/*
* Render the recording's pitch contour over the original audio waveform,
* alongside the original pitch contour, for comparison.
*/
const renderComparingPitchContour = () => {
const region = mediaRegions
.getRegions()
.find((r) => r.id.startsWith("segment-region"));
if (!region) return;
if (!frequencies || !peaks) return;
// Trim silence from both ends of the peaks so only the voiced part of the recording is rendered.
const minValue = 0.01;
let voiceStartIndex = 0;
let voiceEndIndex = peaks.length - 1;
for (let i = 1; i < voiceEndIndex; i++) {
if (peaks[i] >= minValue) {
voiceStartIndex = i;
break;
}
}
for (let i = voiceEndIndex; i > voiceStartIndex; i--) {
if (peaks[i] >= minValue) {
voiceEndIndex = i;
break;
}
}
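// Map the voiced sample range onto the corresponding slice of the (coarser) frequency series.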
const voiceStartFrequenciesIndex = Math.round(
((1.0 * voiceStartIndex) / peaks.length) * frequencies.length
);
const voiceEndFrequenciesIndex = Math.round(
((1.0 * voiceEndIndex) / peaks.length) * frequencies.length
);
regions.clearRegions();
regions.addRegion({
id: `recording-voice-region-${currentRecording.id}`,
start: (voiceStartIndex / peaks.length) * player.getDuration(),
end: (voiceEndIndex / peaks.length) * player.getDuration(),
color: "#fb6f9211",
drag: false,
resize: false,
});
const data = frequencies.slice(
voiceStartFrequenciesIndex,
voiceEndFrequenciesIndex
);
renderMediaPitchContour(region, {
repaint: false,
canvasId: `pitch-contour-${currentRecording.id}-canvas`,
containerClassNames: ["pitch-contour-recording"],
data: {
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
borderColor: "#fb6f92",
pointBorderColor: "#fb6f92",
pointBackgroundColor: "#ff8fab",
},
],
},
});
};
const toggleCompare = () => {
if (isComparing) {
removeComparingPitchContour();
setIsComparing(false);
} else {
setIsComparing(true);
renderComparingPitchContour();
}
};
const handleShare = async () => {
if (!currentRecording.uploadedAt) {
try {
await EnjoyApp.recordings.upload(currentRecording.id);
} catch (error) {
toast.error(t("shareFailed"), { description: error.message });
return;
}
}
webApi
.createPost({
targetId: currentRecording.id,
targetType: "Recording",
})
.then(() => {
toast.success(t("sharedSuccessfully"), {
description: t("sharedRecording"),
});
})
.catch((error) => {
toast.error(t("shareFailed"), {
description: error.message,
});
});
};
useEffect(() => {
if (!ref.current) return;
if (isRecording) return;
if (!currentRecording?.src) return;
const ws = WaveSurfer.create({
container: ref.current,
url: currentRecording.src,
height,
barWidth: 2,
cursorWidth: 1,
autoCenter: true,
autoScroll: true,
minPxPerSec: 150,
waveColor: "#efefef",
normalize: false,
progressColor: "rgba(0, 0, 0, 0.1)",
});
setPlayer(ws);
const regions = ws.registerPlugin(Regions.create());
setRegions(regions);
ws.on("timeupdate", (time: number) => setCurrentTime(time));
ws.on("finish", () => ws.seekTo(0));
ws.on("ready", () => {
const peaks: Float32Array = ws.getDecodedData().getChannelData(0);
const sampleRate = ws.options.sampleRate;
const data = extractFrequencies({ peaks, sampleRate });
setFrequencies(data);
setPeaks(Array.from(peaks));
renderPitchContour({
wrapper: ws.getWrapper(),
canvasId: `pitch-contour-${currentRecording.id}-canvas`,
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
borderColor: "#fb6f92",
pointBorderColor: "#fb6f92",
pointBackgroundColor: "#ff8fab",
},
],
});
});
return () => {
ws.destroy();
};
}, [ref, currentRecording, isRecording]);
useEffect(() => {
setIsComparing(false);
removeComparingPitchContour();
}, [currentRecording]);
useEffect(() => {
if (!isComparing) return;
if (editingRegion) {
setIsComparing(false);
} else {
setTimeout(() => {
renderComparingPitchContour();
}, 100);
}
}, [zoomRatio, editingRegion]);
useEffect(() => {
if (!regions) return;
let disableSelectingRegion: () => void | undefined;
if (isSelectingRegion) {
regions.clearRegions();
disableSelectingRegion = regions.enableDragSelection({
color: "rgba(76, 201, 240, 0.2)",
drag: false,
});
}
const subscriptions = [
regions.on("region-created", () => {}),
regions.on("region-clicked", (region, e) => {
e.stopPropagation();
region.play();
}),
regions.on("region-out", () => {
player.pause();
}),
];
return () => {
disableSelectingRegion && disableSelectingRegion();
regions.clearRegions();
subscriptions.forEach((unsub) => unsub());
};
}, [regions, isSelectingRegion, player]);
/*
* Update player styles
*/
useEffect(() => {
if (!ref?.current || !player) return;
const scrollContainer = player.getWrapper()?.closest(".scroll");
if (!scrollContainer) return;
scrollContainer.style.width = `${
ref.current.getBoundingClientRect().width
}px`;
scrollContainer.style.scrollbarWidth = "thin";
}, [ref, player]);
/*
* play recording along with the media when isComparing is true
* only when the media is playing and the active region is the segment region
*/
useEffect(() => {
if (!regions) return;
if (!isComparing) return;
if (!wavesurfer?.isPlaying()) return;
if (player?.isPlaying()) return;
if (!mediaActiveRegion?.id?.startsWith("segment-region")) return;
regions
.getRegions()
.find((r) => r.id.startsWith("recording-voice-region"))
?.play();
}, [
wavesurfer,
player,
regions,
isComparing,
mediaCurrentTime,
mediaActiveRegion,
]);
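// Ctrl+R (Cmd+R on macOS) toggles play/pause of the current recording.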
useHotkeys(
["Ctrl+R", "Meta+R"],
(keyboardEvent, hotkeyEvent) => {
if (!player) return;
keyboardEvent.preventDefault();
if (
(navigator.platform.includes("Mac") && hotkeyEvent.meta) ||
hotkeyEvent.ctrl
) {
document.getElementById("recording-play-or-pause-button")?.click();
}
},
[player]
);
if (isRecording) return <MediaRecorder />;
if (!currentRecording?.src)
return (
<div className="h-full w-full border rounded-xl shadow-lg flex items-center justify-center">
<div
className="m-auto"
dangerouslySetInnerHTML={{
__html: t("noRecordingForThisSegmentYet"),
}}
></div>
</div>
);
return (
<div className="flex space-x-4">
<div className="border rounded-xl shadow-lg flex-1 relative">
<div ref={ref}></div>
<div className="absolute right-2 top-1">
<span className="text-sm">{formatDuration(currentTime || 0)}</span>
<span className="mx-1">/</span>
<span className="text-sm">
{formatDuration(
player?.getDuration() || currentRecording.duration / 1000.0 || 0
)}
</span>
</div>
</div>
<div className="flex flex-col space-y-1.5">
<Button
variant="default"
size="icon"
id="recording-play-or-pause-button"
data-tooltip-id="media-player-controls-tooltip"
data-tooltip-content={t("playRecording")}
className="rounded-full w-8 h-8 p-0"
onClick={() => {
const region = regions
?.getRegions()
?.find((r) => r.id.startsWith("recording-voice-region"));
if (region) {
region.play();
} else {
player?.playPause();
}
}}
>
{player?.isPlaying() ? (
<PauseIcon className="w-4 h-4" />
) : (
<PlayIcon className="w-4 h-4" />
)}
</Button>
<Button
variant={isComparing ? "secondary" : "outline"}
size="icon"
data-tooltip-id="media-player-controls-tooltip"
data-tooltip-content={t("compare")}
className="rounded-full w-8 h-8 p-0"
onClick={toggleCompare}
>
<GitCompareIcon className="w-4 h-4" />
</Button>
<Button
variant={isSelectingRegion ? "secondary" : "outline"}
size="icon"
data-tooltip-id="media-player-controls-tooltip"
data-tooltip-content={t("selectRegion")}
className="rounded-full w-8 h-8 p-0"
onClick={() => setIsSelectingRegion(!isSelectingRegion)}
>
<TextCursorInputIcon className="w-4 h-4" />
</Button>
<DropdownMenu>
<DropdownMenuTrigger asChild>
<Button
variant="outline"
size="icon"
data-tooltip-id="media-player-controls-tooltip"
data-tooltip-content={t("more")}
className="rounded-full w-8 h-8 p-0"
>
<MoreVerticalIcon className="w-4 h-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent>
<DropdownMenuItem
className="cursor-pointer"
onClick={() => setDetailIsOpen(true)}
>
<GaugeCircleIcon
className={`w-4 h-4 mr-4
${
currentRecording.pronunciationAssessment
? currentRecording.pronunciationAssessment
.pronunciationScore >= 80
? "text-green-500"
: currentRecording.pronunciationAssessment
.pronunciationScore >= 60
? "text-yellow-600"
: "text-red-500"
: ""
}
`}
/>
<span>{t("pronunciationAssessment")}</span>
</DropdownMenuItem>
<DropdownMenuItem
className="cursor-pointer"
onClick={() => setIsSharing(true)}
>
<Share2Icon className="w-4 h-4 mr-4" />
<span>{t("share")}</span>
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
<AlertDialog open={isSharing} onOpenChange={setIsSharing}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("shareRecording")}</AlertDialogTitle>
<AlertDialogDescription>
{t("areYouSureToShareThisRecordingToCommunity")}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<AlertDialogAction asChild>
<Button onClick={handleShare}>{t("share")}</Button>
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
<Sheet open={detailIsOpen} onOpenChange={(open) => setDetailIsOpen(open)}>
<SheetContent
side="bottom"
className="rounded-t-2xl shadow-lg"
displayClose={false}
>
<SheetHeader className="flex items-center justify-center -mt-4 mb-2">
<SheetClose>
<ChevronDownIcon />
</SheetClose>
</SheetHeader>
<RecordingDetail recording={currentRecording} />
</SheetContent>
</Sheet>
</div>
);
};

View File

@@ -0,0 +1,40 @@
import { useContext } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import { formatDuration, formatDateTime } from "@renderer/lib/utils";
import { t } from "i18next";
export const MediaInfoPanel = () => {
const { media } = useContext(MediaPlayerProviderContext);
if (!media) return null;
return (
<div className="px-4" data-testid="media-info-panel">
{[
{ label: t("models.audio.name"), value: media.name },
{
label: t("models.audio.duration"),
value: formatDuration(media.duration),
},
{
label: t("models.audio.recordingsCount"),
value: media.recordingsCount || 0,
},
{
label: t("models.audio.recordingsDuration"),
value: formatDuration(media.recordingsDuration, "ms"),
},
{
label: t("models.audio.createdAt"),
value: formatDateTime(media.createdAt),
},
].map((item, index) => (
<div key={`media-info-item-${index}`} className="mb-2">
<div className="capitalize text-sm text-muted-foreground mb-1">
{item.label}
</div>
<div className="">{item.value}</div>
</div>
))}
</div>
);
};

View File

@@ -0,0 +1,104 @@
import { useContext } from "react";
import {
MediaPlayerProviderContext,
AISettingsProviderContext,
} from "@renderer/context";
import {
AlertDialog,
AlertDialogHeader,
AlertDialogDescription,
AlertDialogTitle,
AlertDialogContent,
AlertDialogFooter,
AlertDialogOverlay,
Button,
PingPoint,
Progress,
} from "@renderer/components/ui";
import { CheckCircleIcon, LoaderIcon } from "lucide-react";
import { t } from "i18next";
import { useNavigate } from "react-router-dom";
export const MediaLoadingModal = () => {
const navigate = useNavigate();
const { whisperConfig } = useContext(AISettingsProviderContext);
const {
decoded,
transcription,
transcribing,
transcribingProgress,
generateTranscription,
} = useContext(MediaPlayerProviderContext);
return (
<AlertDialog open={!decoded || !Boolean(transcription?.result)}>
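{/* Keep the modal open until the waveform is decoded and a transcription result exists. */}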
<AlertDialogOverlay className="z-[100]" />
<AlertDialogContent className="z-[100]">
<AlertDialogHeader>
<AlertDialogTitle>{t("preparingAudio")}</AlertDialogTitle>
<AlertDialogDescription>
{t("itMayTakeAWhileToPrepareForTheFirstLoad")}
</AlertDialogDescription>
</AlertDialogHeader>
<div className="py-4">
{decoded ? (
<div className="mb-4 flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("waveformIsDecoded")}</span>
</div>
) : (
<div className="mb-4 flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("decodingWaveform")}</span>
</div>
)}
{!transcription ? (
<div className="flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("loadingTranscription")}</span>
</div>
) : transcription.result ? (
<div className="flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("transcribedSuccessfully")}</span>
</div>
) : transcribing ? (
<div className="">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{whisperConfig.service === "local" && (
<Progress value={transcribingProgress} />
)}
</div>
) : (
<div className="flex items-center space-x-4">
<PingPoint colorClassName="bg-muted" />
<div className="inline">
<span>{t("notTranscribedYet")}</span>
{decoded && (
<Button
onClick={generateTranscription}
className="ml-4"
size="sm"
>
{t("transcribe")}
</Button>
)}
</div>
</div>
)}
</div>
<AlertDialogFooter>
<Button variant="secondary" onClick={() => navigate(-1)}>
{t("cancel")}
</Button>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
);
};

File diff suppressed because it is too large Load Diff

View File

@@ -1,19 +1,5 @@
import { useEffect, useState, useCallback, useRef, useContext } from "react";
import {
extractFrequencies,
PitchContour,
MediaPlayerControls,
MediaCaption,
} from "@renderer/components";
import Regions, {
Region,
type Region as RegionType,
} from "wavesurfer.js/dist/plugins/regions";
import { secondsToTimestamp } from "@renderer/lib/utils";
import WaveSurfer from "wavesurfer.js";
import { useDebounce } from "@uidotdev/usehooks";
import { AppSettingsProviderContext } from "@renderer/context";
import cloneDeep from "lodash/cloneDeep";
import { useContext } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import {
MediaPlayer as VidstackMediaPlayer,
MediaProvider,
@@ -23,626 +9,32 @@ import {
} from "@vidstack/react";
import {
DefaultAudioLayout,
DefaultVideoLayout,
defaultLayoutIcons,
} from "@vidstack/react/player/layouts/default";
import { useHotkeys } from "react-hotkeys-hook";
const minPxPerSecBase = 150;
export const MediaPlayer = (props: {
mediaId: string;
mediaType: "Audio" | "Video";
mediaUrl: string;
mediaMd5?: string;
transcription: TranscriptionType;
// player controls
currentTime: number;
setCurrentTime: (time: number) => void;
currentSegmentIndex: number;
setCurrentSegmentIndex: (index: number) => void;
initialized: boolean;
setInitialized: (value: boolean) => void;
recordButtonVisible?: boolean;
setRecordButtonVisible?: (value: boolean) => void;
seek?: {
seekTo: number;
timestamp: number;
};
height?: number;
zoomRatio: number;
setZoomRatio: (value: number) => void;
isPlaying: boolean;
setIsPlaying: (value: boolean) => void;
playMode?: "loop" | "single" | "all";
setPlayMode?: (value: "loop" | "single" | "all") => void;
playBackRate: number;
setPlaybackRate: (value: number) => void;
displayInlineCaption?: boolean;
setDisplayInlineCaption?: (value: boolean) => void;
onShare?: () => void;
onDecoded?: (data: { duration: number; sampleRate: number }) => void;
}) => {
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const {
mediaId,
mediaType,
mediaUrl,
mediaMd5,
transcription,
height = 200,
currentTime,
setCurrentTime,
currentSegmentIndex,
setCurrentSegmentIndex,
initialized,
setInitialized,
recordButtonVisible,
setRecordButtonVisible,
seek,
zoomRatio,
setZoomRatio,
isPlaying,
setIsPlaying,
playMode,
setPlayMode,
playBackRate,
setPlaybackRate,
displayInlineCaption,
setDisplayInlineCaption,
onShare,
onDecoded,
} = props;
if (!mediaUrl) return;
const [wavesurfer, setWavesurfer] = useState(null);
const [waveform, setWaveForm] = useState<WaveFormDataType>(null);
const containerRef = useRef<HTMLDivElement>();
const [mediaProvider, setMediaProvider] = useState<
HTMLAudioElement | HTMLVideoElement
>(null);
export const MediaPlayer = () => {
const { media, setMediaProvider } = useContext(MediaPlayerProviderContext);
const mediaRemote = useMediaRemote();
const [transcriptionResult, setTranscriptionResult] = useState<
TranscriptionResultSegmentGroupType[] | null
>(null);
const [transcriptionDirty, setTranscriptionDirty] = useState<boolean>(false);
const [regions, setRegions] = useState<Regions | null>(null);
const debouncedTranscription = useDebounce(transcriptionResult, 500);
const resetTranscription = () => {
if (!transcriptionDirty) return;
if (!transcription?.result) return;
setTranscriptionResult(cloneDeep(transcription.result));
setTranscriptionDirty(false);
};
const saveTranscription = () => {
if (!transcriptionDirty) return;
if (!debouncedTranscription) return;
EnjoyApp.transcriptions.update(transcription.id, {
result: debouncedTranscription,
});
};
const onPlayClick = useCallback(() => {
wavesurfer.isPlaying() ? wavesurfer.pause() : wavesurfer.play();
}, [wavesurfer]);
const handlePlaybackRateChange = useCallback(
(rate: number) => {
wavesurfer.setPlaybackRate(rate);
setPlaybackRate(wavesurfer.getPlaybackRate());
},
[initialized]
);
const findCurrentSegment = (time: number) => {
if (!transcription) return;
if (isPlaying && playMode === "loop") return;
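// Transcription offsets are stored in milliseconds, so convert the player's time from seconds first.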
time = Math.round(time * 1000);
const index = transcriptionResult.findIndex(
(t) => time >= t.offsets.from && time < t.offsets.to
);
if (index === -1) return;
setCurrentSegmentIndex(index);
};
const addSegmentRegion = (from: number, to: number) => {
if (!initialized) return;
const span = document.createElement("span");
span.innerText = secondsToTimestamp(from) + ` (${(to - from).toFixed(2)}s)`;
span.style.padding = "1rem";
span.style.fontSize = "0.9rem";
if (regions) {
regions.clearRegions();
const region = regions.addRegion({
start: from,
end: to,
color: "rgba(255, 0, 0, 0.03)",
drag: false,
resize: true,
content: span,
});
renderPitchContour(region);
}
};
const renderPitchContour = (region: RegionType) => {
if (!region) return;
if (!waveform?.frequencies?.length) return;
if (!wavesurfer) return;
const duration = wavesurfer.getDuration();
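// Map the region's time range onto indices of the frequency array.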
const fromIndex = Math.round(
(region.start / duration) * waveform.frequencies.length
);
const toIndex = Math.round(
(region.end / duration) * waveform.frequencies.length
);
const containerId = `pitch-contour-${mediaId}-${currentSegmentIndex}`;
const wrapper = wavesurfer.renderer.getWrapper();
const wrapperWidth = wrapper.getBoundingClientRect().width;
const canvas = PitchContour({
frequencies: waveform.frequencies.slice(fromIndex, toIndex),
height,
});
const offsetLeft = (region.start / duration) * wrapperWidth;
const width = ((region.end - region.start) / duration) * wrapperWidth;
const pitchContourWidthContainer = document.createElement("div");
pitchContourWidthContainer.appendChild(canvas);
pitchContourWidthContainer.style.position = "absolute";
pitchContourWidthContainer.style.top = "0";
pitchContourWidthContainer.style.left = "0";
canvas.style.width = `${width}px`;
pitchContourWidthContainer.style.height = `${height}px`;
pitchContourWidthContainer.style.marginLeft = `${offsetLeft}px`;
pitchContourWidthContainer.className = "pitch-contour";
pitchContourWidthContainer.id = containerId;
const regionDuration = region.end - region.start;
if (displayInlineCaption) {
const captionContainer = document.createElement("div");
captionContainer.style.position = "absolute";
captionContainer.style.bottom = "0";
captionContainer.style.width = `${width}px`;
captionContainer.style.fontSize = "0.75rem";
captionContainer.style.opacity = "0.75";
transcriptionResult?.[currentSegmentIndex]?.segments?.forEach(
(segment, index) => {
const span = document.createElement("span");
span.innerText = segment.text;
span.style.position = "absolute";
span.style.bottom = "0";
span.style.left = `${
((segment.offsets.from / 1000 - region.start) / regionDuration) *
width
}px`;
if (index % 2 === 1) {
span.style.paddingBottom = "0.75rem";
}
captionContainer.appendChild(span);
}
);
pitchContourWidthContainer.appendChild(captionContainer);
}
wrapper.querySelector("#" + containerId)?.remove();
wrapper.appendChild(pitchContourWidthContainer);
};
const reRenderPitchContour = () => {
if (!wavesurfer) return;
const wrapper = wavesurfer.renderer.getWrapper();
wrapper
.querySelectorAll(".pitch-contour")
.forEach((canvas: HTMLCanvasElement) => {
canvas.remove();
});
if (!regions) return;
const region = regions.getRegions()[0];
if (!region) return;
renderPitchContour(region);
};
useEffect(() => {
if (!transcription) return;
setTranscriptionDirty(false);
setTranscriptionResult(cloneDeep(transcription.result));
}, [transcription]);
// Initialize wavesurfer
const initializeWavesurfer = async () => {
if (!mediaProvider) return;
if (!containerRef.current) return;
const ws = WaveSurfer.create({
container: containerRef.current,
height,
waveColor: "#ddd",
progressColor: "rgba(0, 0, 0, 0.25)",
cursorColor: "#dc143c",
barWidth: 1,
autoScroll: true,
minPxPerSec: 150,
autoCenter: false,
dragToSeek: false,
media: mediaProvider,
peaks: waveform ? [waveform.peaks] : undefined,
duration: waveform ? waveform.duration : undefined,
});
const blob = await fetch(mediaUrl).then((res) => res.blob());
if (waveform) {
ws.loadBlob(blob, [waveform.peaks], waveform.duration);
setInitialized(true);
} else {
ws.loadBlob(blob);
}
setRegions(ws.registerPlugin(Regions.create()));
setWavesurfer(ws);
};
useEffect(() => {
initializeWavesurfer();
return () => {
wavesurfer?.destroy();
};
}, [mediaUrl, height, mediaProvider]);
// Install listeners for wavesurfer
useEffect(() => {
if (!wavesurfer) return;
setCurrentTime(0);
setIsPlaying(false);
const subscriptions = [
wavesurfer.on("play", () => setIsPlaying(true)),
wavesurfer.on("pause", () => setIsPlaying(false)),
wavesurfer.on("loading", (percent: number) => console.log(`${percent}%`)),
wavesurfer.on("timeupdate", (time: number) => setCurrentTime(time)),
wavesurfer.on("decode", () => {
if (waveform?.frequencies) return;
const peaks: Float32Array = wavesurfer
.getDecodedData()
.getChannelData(0);
const duration: number = wavesurfer.getDuration();
const sampleRate = wavesurfer.options.sampleRate;
const _frequencies = extractFrequencies({ peaks, sampleRate });
const _waveform = {
peaks: Array.from(peaks),
duration,
sampleRate,
frequencies: _frequencies,
};
EnjoyApp.waveforms.save(mediaMd5, _waveform);
setWaveForm(_waveform);
onDecoded &&
onDecoded({
duration,
sampleRate,
});
}),
wavesurfer.on("ready", () => {
setInitialized(true);
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [wavesurfer]);
useEffect(() => {
if (!transcriptionResult) return;
if (transcriptionDirty) return;
const currentSegment = transcriptionResult[currentSegmentIndex];
if (!currentSegment) return;
addSegmentRegion(
currentSegment.offsets.from / 1000.0,
currentSegment.offsets.to / 1000.0
);
// set zoom ratio to fit the current segment
if (!isPlaying) {
setZoomRatio(calcFitZoomRatio());
}
}, [
currentSegmentIndex,
initialized,
transcriptionDirty,
transcriptionResult,
]);
useEffect(() => {
if (!transcriptionResult) return;
findCurrentSegment(currentTime);
}, [currentTime, transcriptionResult]);
useEffect(() => {
if (!regions) return;
const subscriptions = [
wavesurfer.on("finish", () => {
if (playMode !== "loop") return;
regions?.getRegions()[0]?.play();
}),
regions.on("region-updated", (region) => {
const from = region.start;
const to = region.end;
const offsets = {
from: Math.round(from * 1000),
to: Math.round(to * 1000),
};
const timestamps = {
from: [
secondsToTimestamp(from),
Math.round((from * 1000) % 1000),
].join(","),
to: [secondsToTimestamp(to), Math.round((to * 1000) % 1000)].join(
","
),
};
const _transcription = cloneDeep(transcriptionResult);
_transcription[currentSegmentIndex].offsets = offsets;
_transcription[currentSegmentIndex].timestamps = timestamps;
// ensure that the previous segment ends before the current segment
if (
currentSegmentIndex > 0 &&
_transcription[currentSegmentIndex - 1].offsets.to > offsets.from
) {
_transcription[currentSegmentIndex - 1].offsets.to = offsets.from;
}
// ensure that the next segment starts after the current segment
if (
currentSegmentIndex < _transcription.length - 1 &&
_transcription[currentSegmentIndex + 1].offsets.from < offsets.to
) {
_transcription[currentSegmentIndex + 1].offsets.from = offsets.to;
}
setTranscriptionResult(_transcription);
setTranscriptionDirty(true);
renderPitchContour(region);
}),
regions.on("region-out", (region: Region) => {
if (isPlaying && playMode === "loop") {
region.play();
} else if (isPlaying && playMode === "single") {
wavesurfer.pause();
wavesurfer.seekTo(region.start / wavesurfer.getDuration());
} else {
resetTranscription();
}
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [regions, isPlaying, playMode, currentSegmentIndex, transcriptionDirty]);
useEffect(() => {
if (!wavesurfer) return;
if (!initialized) return;
wavesurfer.zoom(zoomRatio * minPxPerSecBase);
reRenderPitchContour();
}, [zoomRatio, wavesurfer, initialized, displayInlineCaption]);
useEffect(() => {
if (typeof seek?.seekTo !== "number") return;
if (!wavesurfer) return;
if (!initialized) return;
wavesurfer.seekTo(seek?.seekTo / wavesurfer.getDuration());
wavesurfer.setScrollTime(seek?.seekTo);
}, [seek, wavesurfer, initialized]);
// Handle media provider
useEffect(() => {
if (!mediaRemote) return;
if (!mediaProvider) return;
if (mediaType !== "Video") return;
if (recordButtonVisible) {
mediaRemote.togglePictureInPicture();
} else {
mediaRemote.exitPictureInPicture();
}
}, [mediaRemote, mediaProvider, recordButtonVisible]);
useEffect(() => {
if (!wavesurfer) return;
if (isPlaying) {
wavesurfer.play();
} else {
wavesurfer.pause();
}
}, [wavesurfer, isPlaying]);
useEffect(() => {
EnjoyApp.waveforms.find(mediaMd5).then((waveform) => {
if (!waveform) return;
setWaveForm(waveform);
onDecoded &&
onDecoded({
duration: waveform.duration,
sampleRate: waveform.sampleRate,
});
});
}, []);
const calcFitZoomRatio = () => {
if (!containerRef.current) return;
if (!wavesurfer) return;
const currentSegment = transcriptionResult?.[currentSegmentIndex];
if (!currentSegment) return;
const containerWidth = containerRef.current.getBoundingClientRect().width;
const duration =
currentSegment.offsets.to / 1000.0 - currentSegment.offsets.from / 1000.0;
const fitZoomRatio = containerWidth / duration / minPxPerSecBase;
return fitZoomRatio;
};
useHotkeys(
"Space",
(keyboardEvent, _hotkeyEvent) => {
if (!wavesurfer) return;
keyboardEvent.preventDefault();
onPlayClick();
},
[wavesurfer]
);
if (!media?.src) return null;
return (
<>
<div
className="mb-2"
ref={containerRef}
data-testid="media-player-container"
/>
<div className="mb-2 flex justify-center">
<MediaPlayerControls
isPlaying={isPlaying}
onPlayOrPause={onPlayClick}
onNext={() => {
if (!transcription) return;
const segment = transcription?.result?.[currentSegmentIndex + 1];
if (!segment) return;
wavesurfer.seekTo(
segment.offsets.from / 1000 / wavesurfer.getDuration()
);
}}
onPrev={() => {
if (!transcription) return;
const segment = transcription?.result?.[currentSegmentIndex - 1];
if (!segment) return;
wavesurfer.seekTo(
segment.offsets.from / 1000 / wavesurfer.getDuration()
);
}}
playMode={playMode}
setPlayMode={setPlayMode}
playbackRate={playBackRate}
setPlaybackRate={handlePlaybackRateChange}
zoomRatio={zoomRatio}
setZoomRatio={setZoomRatio}
fitZoomRatio={calcFitZoomRatio()}
recordButtonVisible={recordButtonVisible}
setRecordButtonVisible={setRecordButtonVisible}
transcriptionDirty={transcriptionDirty}
resetTranscription={resetTranscription}
saveTranscription={saveTranscription}
wavesurferOptions={wavesurfer?.options}
setWavesurferOptions={(options) => wavesurfer?.setOptions(options)}
displayInlineCaption={displayInlineCaption}
setDisplayInlineCaption={setDisplayInlineCaption}
onShare={onShare}
/>
</div>
{initialized && (
<div className={recordButtonVisible && mediaProvider ? "" : "hidden"}>
<MediaCaption
key={`${mediaId}-${currentSegmentIndex}`}
mediaId={mediaId}
mediaType={mediaType}
currentTime={currentTime}
transcription={transcriptionResult?.[currentSegmentIndex]}
onSeek={(time) => {
wavesurfer.seekTo(time / wavesurfer.getDuration());
}}
isPlaying={isPlaying}
setIsPlaying={setIsPlaying}
/>
</div>
)}
<div
className={recordButtonVisible && mediaProvider ? "hidden" : "flex-1"}
<div className="px-4" data-testid="media-player">
<VidstackMediaPlayer
controls
src={media.src}
onCanPlayThrough={(detail, nativeEvent) => {
mediaRemote.setTarget(nativeEvent.target);
const { provider } = detail;
if (isAudioProvider(provider)) {
setMediaProvider(provider.audio);
} else if (isVideoProvider(provider)) {
setMediaProvider(provider.video);
}
}}
>
<VidstackMediaPlayer
src={mediaUrl}
onCanPlayThrough={(detail, nativeEvent) => {
mediaRemote.setTarget(nativeEvent.target);
const { provider } = detail;
if (isAudioProvider(provider)) {
setMediaProvider(provider.audio);
} else if (isVideoProvider(provider)) {
setMediaProvider(provider.video);
}
}}
>
<MediaProvider />
{mediaType === "Audio" && (
<DefaultAudioLayout icons={defaultLayoutIcons} />
)}
{mediaType === "Video" && (
<>
<DefaultVideoLayout icons={defaultLayoutIcons} />
<div className="vds-captions">
<div className="absolute mx-auto bottom-[15%] flex items-center justify-center w-full">
<div className="flex">
<MediaCaption
mediaId={mediaId}
mediaType={mediaType}
className="mx-auto w-5/6 text-center bg-primary/70 text-xl text-white"
transcription={transcriptionResult?.[currentSegmentIndex]}
currentTime={currentTime}
isPlaying={isPlaying}
setIsPlaying={setIsPlaying}
/>
</div>
</div>
</div>
</>
)}
</VidstackMediaPlayer>
</div>
</>
<MediaProvider />
<DefaultAudioLayout icons={defaultLayoutIcons} />
</VidstackMediaPlayer>
</div>
);
};

View File

@@ -0,0 +1,145 @@
import { useEffect, useState, useContext, useRef } from "react";
import {
MediaPlayerProviderContext,
AppSettingsProviderContext,
} from "@renderer/context";
import RecordPlugin from "wavesurfer.js/dist/plugins/record";
import WaveSurfer from "wavesurfer.js";
import { t } from "i18next";
import { useTranscribe } from "@renderer/hooks";
import { toast } from "@renderer/components/ui";
import {
FFMPEG_TRIM_SILENCE_OPTIONS,
FFMPEG_CONVERT_WAV_OPTIONS,
} from "@/constants";
export const MediaRecorder = (props: { height?: number }) => {
const { height = 192 } = props;
const {
media,
isRecording,
setIsRecording,
transcription,
currentSegmentIndex,
} = useContext(MediaPlayerProviderContext);
const [access, setAccess] = useState<boolean>(false);
const [duration, setDuration] = useState<number>(0);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { transcode } = useTranscribe();
const ref = useRef(null);
const askForMediaAccess = () => {
EnjoyApp.system.preferences.mediaAccess("microphone").then((access) => {
if (access) {
setAccess(true);
} else {
setAccess(false);
toast.warning(t("noMicrophoneAccess"));
}
});
};
const createRecording = async (params: { blob: Blob; duration: number }) => {
if (!media) return;
const { blob, duration } = params;
toast.promise(
async () => {
let output: Blob;
output = await transcode(blob, [
// ...FFMPEG_TRIM_SILENCE_OPTIONS,
...FFMPEG_CONVERT_WAV_OPTIONS,
]);
const currentSegment =
transcription?.result?.timeline?.[currentSegmentIndex];
if (!currentSegment) return;
return EnjoyApp.recordings.create({
targetId: media.id,
targetType: media.mediaType,
blob: {
type: output.type.split(";")[0],
arrayBuffer: await output.arrayBuffer(),
},
referenceId: currentSegmentIndex,
referenceText: currentSegment.text,
duration,
});
},
{
loading: t("savingRecording"),
success: t("recordingSaved"),
error: (e) => t("failedToSaveRecording") + " : " + e.message,
position: "bottom-right",
},
);
};
useEffect(() => {
if (!access) return;
if (!isRecording) return;
if (!ref.current) return;
const ws = WaveSurfer.create({
container: ref.current,
fillParent: true,
height,
autoCenter: false,
normalize: false,
});
const record = ws.registerPlugin(RecordPlugin.create());
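// Track wall-clock time to compute the real recording duration in milliseconds.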
let startAt = 0;
record.on("record-start", () => {
startAt = Date.now();
});
record.on("record-end", async (blob: Blob) => {
createRecording({ blob, duration: Date.now() - startAt });
});
let interval: NodeJS.Timeout;
RecordPlugin.getAvailableAudioDevices()
.then((devices) => devices.find((d) => d.kind === "audioinput"))
.then((device) => {
if (device) {
record.startRecording({ deviceId: device.deviceId });
setDuration(0);
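// Tick every 100 ms: duration counts tenths of a second and recording stops automatically after 30 seconds.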
interval = setInterval(() => {
setDuration((duration) => {
if (duration >= 300) {
setIsRecording(false);
}
return duration + 1;
});
}, 100);
} else {
toast.error(t("cannotFindMicrophone"));
}
});
return () => {
clearInterval(interval);
record.stopRecording();
ws.destroy();
};
}, [ref, isRecording, access]);
useEffect(() => {
askForMediaAccess();
}, []);
return (
<div className="border rounded-xl shadow-lg relative">
<span className="absolute bottom-2 right-2 serif">
{duration / 10}
<span className="text-xs"> / 300</span>
</span>
<div className="h-full" ref={ref}></div>
</div>
);
};

View File

@@ -0,0 +1,149 @@
import { useContext, useRef, useEffect, useState } from "react";
import {
AlertDialog,
AlertDialogHeader,
AlertDialogDescription,
AlertDialogTitle,
AlertDialogContent,
AlertDialogFooter,
AlertDialogCancel,
AlertDialogAction,
Button,
DropdownMenu,
DropdownMenuItem,
DropdownMenuTrigger,
DropdownMenuContent,
ScrollArea,
} from "@renderer/components/ui";
import {
AppSettingsProviderContext,
MediaPlayerProviderContext,
} from "@renderer/context";
import {
LoaderIcon,
MicIcon,
MoreHorizontalIcon,
Trash2Icon,
} from "lucide-react";
import { t } from "i18next";
import { formatDateTime, formatDuration } from "@renderer/lib/utils";
export const MediaRecordings = () => {
const containerRef = useRef<HTMLDivElement>();
const {
recordings = [],
hasMoreRecordings,
loadingRecordings,
fetchRecordings,
currentRecording,
setCurrentRecording,
currentSegmentIndex,
} = useContext(MediaPlayerProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const [selectedRecording, setSelectedRecording] = useState(null);
const handleDelete = () => {
if (!selectedRecording) return;
EnjoyApp.recordings.destroy(selectedRecording.id);
};
useEffect(() => {
setCurrentRecording(recordings[0]);
}, [currentSegmentIndex, recordings]);
return (
<div ref={containerRef} data-testid="media-recordings-result">
{recordings.length == 0 && (
<div
className="text-center px-6 py-8 text-sm text-muted-foreground"
dangerouslySetInnerHTML={{
__html: t("noRecordingForThisSegmentYet"),
}}
></div>
)}
{recordings.map((recording) => (
<div
key={recording.id}
className={`flex items-center justify-between px-4 py-2 cursor-pointer ${
recording.id === currentRecording?.id ? "bg-muted" : ""
}`}
style={{
borderLeftColor: `#${recording.md5.substr(0, 6)}`,
borderLeftWidth: 3,
}}
onClick={() => {
setCurrentRecording(recording);
}}
>
<div className="flex items-center space-x-2">
<MicIcon className="w-4 h-4" />
<span>{formatDuration(recording.duration, "ms")}</span>
</div>
<div className="flex items-center space-x-2">
<span className="text-sm text-muted-foreground">
{formatDateTime(recording.createdAt)}
</span>
<DropdownMenu>
<DropdownMenuTrigger>
<MoreHorizontalIcon className="w-4 h-4" />
</DropdownMenuTrigger>
<DropdownMenuContent>
<DropdownMenuItem
className="text-destructive cursor-pointer"
onClick={() => setSelectedRecording(recording)}
>
<Trash2Icon className="w-4 h-4 mr-2" />
<span>{t("delete")}</span>
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>
</div>
</div>
))}
{hasMoreRecordings && (
<div className="py-2 flex items-center justify-center">
<Button
variant="outline"
size="sm"
disabled={loadingRecordings}
onClick={() => fetchRecordings(recordings.length)}
>
{loadingRecordings && (
<LoaderIcon className="w-4 h-4 animate-spin mr-2" />
)}
{t("loadMore")}
</Button>
</div>
)}
<AlertDialog
open={Boolean(selectedRecording)}
onOpenChange={(value) => {
if (value) return;
setSelectedRecording(null);
}}
>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("deleteRecording")}</AlertDialogTitle>
<AlertDialogDescription>
{t("deleteRecordingConfirmation")}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<AlertDialogAction asChild>
<Button onClick={handleDelete}>{t("delete")}</Button>
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</div>
);
};

View File

@@ -0,0 +1,78 @@
import { useEffect, useContext, useState } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import {
MediaPlayer,
MediaTranscription,
MediaInfoPanel,
MediaRecordings,
} from "@renderer/components";
import { ScrollArea } from "@renderer/components/ui";
import { t } from "i18next";
export const MediaTabs = () => {
const { media, decoded } = useContext(MediaPlayerProviderContext);
const [tab, setTab] = useState("player");
useEffect(() => {
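// Switch to the transcription tab once the waveform has been decoded.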
if (!decoded) return;
setTab("transcription");
}, [decoded]);
if (!media) return null;
return (
<ScrollArea className="h-full">
<div className="flex items-center space-x-2 justify-between p-1 bg-muted rounded-t-lg mb-2 text-sm sticky top-0 z-10">
{media.mediaType === "Video" && (
<div
className={`rounded cursor-pointer px-2 py-1 text-sm text-center capitalize ${
tab === "player" ? "bg-background" : ""
}`}
onClick={() => setTab("player")}
>
{t("player")}
</div>
)}
<div
className={`rounded cursor-pointer px-2 py-1 text-sm text-center capitalize ${
tab === "transcription" ? "bg-background" : ""
}`}
onClick={() => setTab("transcription")}
>
{t("transcription")}
</div>
<div
className={`rounded cursor-pointer px-2 py-1 text-sm text-center capitalize ${
tab === "recordings" ? "bg-background" : ""
}`}
onClick={() => setTab("recordings")}
>
{t("myRecordings")}
</div>
<div
className={`rounded cursor-pointer px-2 py-1 text-sm text-center capitalize ${
tab === "info" ? "bg-background" : ""
}`}
onClick={() => setTab("info")}
>
{t("mediaInfo")}
</div>
</div>
<div className={tab === "player" ? "" : "hidden"}>
<MediaPlayer />
</div>
<div className={tab === "recordings" ? "" : "hidden"}>
<MediaRecordings />
</div>
<div className={tab === "transcription" ? "" : "hidden"}>
<MediaTranscription />
</div>
<div className={tab === "info" ? "" : "hidden"}>
<MediaInfoPanel />
</div>
</ScrollArea>
);
};

View File

@@ -1,4 +1,12 @@
import { useEffect, useContext, useRef, useState } from "react";
import {
AppSettingsProviderContext,
DbProviderContext,
MediaPlayerProviderContext,
} from "@renderer/context";
import { t } from "i18next";
import {
Button,
AlertDialog,
AlertDialogTrigger,
AlertDialogFooter,
@@ -8,182 +16,150 @@ import {
AlertDialogDescription,
AlertDialogCancel,
AlertDialogAction,
Skeleton,
ScrollArea,
Button,
PingPoint,
} from "@renderer/components/ui";
import React, { useEffect, useContext, useState } from "react";
import { t } from "i18next";
import { LoaderIcon, CheckCircleIcon, MicIcon } from "lucide-react";
import {
DbProviderContext,
AppSettingsProviderContext,
AISettingsProviderContext,
} from "@renderer/context";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { formatDuration } from "@renderer/lib/utils";
export const MediaTranscription = (props: {
transcription: TranscriptionType;
progress: number;
transcribe: () => void;
transcribing: boolean;
mediaId: string;
mediaType: "Audio" | "Video";
mediaName?: string;
currentSegmentIndex?: number;
onSelectSegment?: (index: number) => void;
}) => {
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const { whisperConfig } = useContext(AISettingsProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
export const MediaTranscription = () => {
const containerRef = useRef<HTMLDivElement>();
const {
transcription,
transcribing,
progress,
transcribe,
mediaId,
mediaType,
mediaName,
media,
currentSegmentIndex,
onSelectSegment,
} = props;
const containerRef = React.createRef<HTMLDivElement>();
wavesurfer,
setCurrentSegmentIndex,
transcription,
generateTranscription,
transcribing,
transcribingProgress,
} = useContext(MediaPlayerProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [recordingStats, setRecordingStats] =
useState<SegementRecordingStatsType>([]);
const fetchSegmentStats = async () => {
if (!mediaId) return;
if (!media) return;
EnjoyApp.recordings.groupBySegment(mediaId, mediaType).then((stats) => {
setRecordingStats(stats);
});
EnjoyApp.recordings
.groupBySegment(media.id, media.mediaType)
.then((stats) => {
setRecordingStats(stats);
});
};
useEffect(() => {
if (!transcription?.result) return;
addDblistener(fetchSegmentStats);
fetchSegmentStats();
return () => {
removeDbListener(fetchSegmentStats);
};
}, [transcription]);
}, [transcription?.result]);
useEffect(() => {
if (!containerRef?.current) return;
containerRef.current
?.querySelector(`#segment-${currentSegmentIndex}`)
?.scrollIntoView({
block: "center",
inline: "center",
} as ScrollIntoViewOptions);
}, [currentSegmentIndex, transcription]);
}, [currentSegmentIndex, transcription, containerRef]);
if (!transcription)
return (
<div className="p-4 w-full">
<TranscriptionPlaceholder />
</div>
);
if (!transcription?.result) {
return null;
}
return (
<div
className="w-full h-full flex flex-col"
data-testid="media-transcription"
>
<div className="mb-4 flex items-cener justify-between">
<div className="flex items-center space-x-2">
{transcribing || transcription.state === "processing" ? (
<>
<PingPoint colorClassName="bg-yellow-500" />
<div className="text-sm">
{whisperConfig.service === "local" && `${progress}%`}
</div>
</>
) : transcription.state === "finished" ? (
<CheckCircleIcon className="text-green-500 w-4 h-4" />
) : (
<PingPoint colorClassName="bg-mute" />
)}
<span className="capitalize">{t("transcript")}</span>
<div ref={containerRef} data-testid="media-transcription-result">
<div className="px-4 py-1 bg-background">
<div className="flex items-cener justify-between">
<div className="flex items-center space-x-2">
{transcribing || transcription.state === "processing" ? (
<>
<PingPoint colorClassName="bg-yellow-500" />
<div className="text-sm">
{transcribingProgress > 0 && `${transcribingProgress}%`}
</div>
</>
) : transcription.state === "finished" ? (
<CheckCircleIcon className="text-green-500 w-4 h-4" />
) : (
<PingPoint colorClassName="bg-mute" />
)}
<span className="capitalize">{t("transcript")}</span>
</div>
<AlertDialog>
<AlertDialogTrigger asChild>
<Button
variant="outline"
size="sm"
disabled={transcribing || transcription.state === "processing"}
className="capitalize"
>
{(transcribing || transcription.state === "processing") && (
<LoaderIcon className="animate-spin w-4 mr-2" />
)}
{transcription.result ? t("regenerate") : t("transcribe")}
</Button>
</AlertDialogTrigger>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
<AlertDialogDescription>
{t("transcribeMediaConfirmation", {
name: media.name,
})}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<AlertDialogAction onClick={generateTranscription}>
{t("transcribe")}
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</div>
<AlertDialog>
<AlertDialogTrigger asChild>
<Button
disabled={transcribing || transcription.state === "processing"}
className="capitalize"
>
{(transcribing || transcription.state === "processing") && (
<LoaderIcon className="animate-spin w-4 mr-2" />
)}
{transcription.result ? t("regenerate") : t("transcribe")}
</Button>
</AlertDialogTrigger>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
<AlertDialogDescription>
{t("transcribeAudioConfirmation", {
name: mediaName,
})}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<AlertDialogAction onClick={transcribe}>
{t("transcribe")}
</AlertDialogAction>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
</div>
{transcription?.result ? (
<ScrollArea
ref={containerRef}
className="flex-1 px-2"
data-testid="media-transcription-result"
>
{transcription.result.map((t, index) => (
<div
key={index}
id={`segment-${index}`}
className={`py-1 px-2 mb-2 cursor-pointer hover:bg-yellow-400/25 ${
currentSegmentIndex === index ? "bg-yellow-400/25" : ""
}`}
onClick={() => {
onSelectSegment?.(index);
}}
>
<div className="flex items-center justify-between">
<span className="text-xs opacity-50">#{index + 1}</span>
<div className="flex items-center space-x-2">
{(recordingStats || []).findIndex(
(s) => s.referenceId === index
) !== -1 && <MicIcon className="w-3 h-3 text-sky-500" />}
<span className="text-xs opacity-50">
{t.timestamps.from.split(",")[0]}
</span>
</div>
{(transcription.result as AlignmentResult).timeline.map(
(sentence, index) => (
<div
key={index}
id={`segment-${index}`}
className={`py-2 px-4 cursor-pointer hover:bg-yellow-400/10 ${
currentSegmentIndex === index ? "bg-yellow-400/25" : ""
}`}
onClick={() => {
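// Truncate to 8 decimal places so floating-point error cannot push the seek ratio past 1.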
wavesurfer.seekTo(
Math.floor((sentence.startTime / media.duration) * 1e8) / 1e8
);
wavesurfer.setScrollTime(sentence.startTime);
setCurrentSegmentIndex(index);
}}
>
<div className="flex items-center justify-between">
<span className="text-xs opacity-50">#{index + 1}</span>
<div className="flex items-center space-x-2">
{(recordingStats || []).findIndex(
(s) => s.referenceId === index
) !== -1 && <MicIcon className="w-3 h-3 text-sky-500" />}
<span className="text-xs opacity-50">
{formatDuration(sentence.startTime, "s")}
</span>
</div>
<p className="">{t.text}</p>
</div>
))}
</ScrollArea>
) : (
<TranscriptionPlaceholder />
<p className="">{sentence.text}</p>
</div>
)
)}
</div>
);
};
export const TranscriptionPlaceholder = () => {
return (
<div className="p-4">
{Array.from({ length: 5 }).map((_, i) => (
<Skeleton key={i} className="h-4 w-full mb-4" />
))}
<Skeleton className="h-4 w-3/5" />
</div>
);
};

View File

@@ -14,7 +14,7 @@ import {
} from "@renderer/components/ui";
import {
SpeechPlayer,
AudioDetail,
AudioPlayer,
ConversationShortcuts,
} from "@renderer/components";
import { useState, useEffect, useContext } from "react";
@@ -242,16 +242,16 @@ export const AssistantMessageComponent = (props: {
<Sheet open={shadowing} onOpenChange={(value) => setShadowing(value)}>
<SheetContent
side="bottom"
className="rounded-t-2xl shadow-lg"
className="h-100vh p-0"
displayClose={false}
>
<SheetHeader className="flex items-center justify-center -mt-4 mb-2">
<SheetHeader className="flex items-center justify-center h-14">
<SheetClose>
<ChevronDownIcon />
</SheetClose>
</SheetHeader>
{Boolean(speech) && <AudioDetail md5={speech.md5} />}
{Boolean(speech) && <AudioPlayer md5={speech.md5} />}
</SheetContent>
</Sheet>
</div>

View File

@@ -1,79 +0,0 @@
import Pitchfinder from "pitchfinder";
export const extractFrequencies = (props: {
peaks: Float32Array;
sampleRate: number;
}): number[] => {
const { peaks, sampleRate } = props;
const detectPitch = Pitchfinder.AMDF({ sampleRate });
const duration = peaks.length / sampleRate;
const bpm = peaks.length / duration / 60;
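// peaks.length / duration equals the sample rate, so bpm here is sampleRate / 60; it sets both tempo and quantization for the pitch detector.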
const frequencies = Pitchfinder.frequencies(detectPitch, peaks, {
tempo: bpm,
quantization: bpm,
});
return frequencies;
};
export const PitchContour = (props: {
peaks?: Float32Array;
sampleRate?: number;
frequencies?: number[];
height: number;
id?: string;
}) => {
const { peaks, sampleRate, height, id } = props;
let { frequencies } = props;
if (!frequencies) {
frequencies = extractFrequencies({ peaks, sampleRate });
}
// Find the baseline frequency (the value that appears most often)
const frequencyMap: any = {};
let maxAmount = 0;
let baseFrequency = 0;
frequencies.forEach((frequency) => {
if (!frequency) return;
const tolerance = 10;
frequency = Math.round(frequency * tolerance) / tolerance;
if (!frequencyMap[frequency]) frequencyMap[frequency] = 0;
frequencyMap[frequency] += 1;
if (frequencyMap[frequency] > maxAmount) {
maxAmount = frequencyMap[frequency];
baseFrequency = frequency;
}
});
const pitchUpColor = "#385587";
// const pitchDownColor = "#C26351";
const pitchDownColor = "#385587";
const canvas = document.createElement("canvas");
const ctx = canvas.getContext("2d");
canvas.width = frequencies.length;
canvas.height = height;
canvas.style.width = "100%";
canvas.style.height = "100%";
// Each frequency is a point whose Y position is the frequency and X position is the time
let prevY = 0;
frequencies.forEach((frequency, index) => {
if (!frequency) return;
const hratio = 0.5; // the bigger the narrower the pitch contour drawn on canvas.
const marginTop = height * 0.4; // the bigger the lower the pitch contour positioned.
const y =
Math.round(height - (frequency / (baseFrequency * 2)) * height) * hratio +
marginTop;
ctx.fillStyle = y > prevY ? pitchDownColor : pitchUpColor;
ctx.fillRect(index, y, 1, 2);
prevY = y;
});
canvas.id = id;
return canvas;
};

View File

@@ -1,6 +1,7 @@
import { useEffect, useState, useRef, useCallback, useContext } from "react";
import { AppSettingsProviderContext } from "@renderer/context";
import { PitchContour } from "@renderer/components";
import { renderPitchContour } from "@renderer/lib/utils";
import { extractFrequencies } from "@/utils";
import WaveSurfer from "wavesurfer.js";
import { Button, Skeleton } from "@renderer/components/ui";
import { PlayIcon, PauseIcon } from "lucide-react";
@@ -12,6 +13,7 @@ import {
defaultLayoutIcons,
} from "@vidstack/react/player/layouts/default";
export const STORAGE_WORKER_ENDPOINT = "https://enjoy-storage.baizhiheizi.com";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
export const PostAudio = (props: {
audio: Partial<MediumType>;
@@ -22,11 +24,16 @@ export const PostAudio = (props: {
const { webApi } = useContext(AppSettingsProviderContext);
const [transcription, setTranscription] = useState<TranscriptionType>();
const currentTranscription = (transcription?.result || []).find(
(s) =>
currentTime >= s.offsets.from / 1000.0 &&
currentTime <= s.offsets.to / 1000.0
);
const currentTranscription = transcription?.result?.["transcript"]
? (transcription.result?.timeline || []).find(
(s: TimelineEntry) =>
currentTime >= s.startTime && currentTime <= s.endTime
)
: (transcription?.result || []).find(
(s: TranscriptionResultSegmentType) =>
currentTime >= s.offsets.from / 1000.0 &&
currentTime <= s.offsets.to / 1000.0
);
useEffect(() => {
webApi
@@ -134,17 +141,25 @@ const WavesurferPlayer = (props: {
wavesurfer.on("timeupdate", (time: number) => {
setCurrentTime(time);
}),
wavesurfer.on("decode", () => {
wavesurfer.on("ready", () => {
setDuration(wavesurfer.getDuration());
const peaks = wavesurfer.getDecodedData().getChannelData(0);
const sampleRate = wavesurfer.options.sampleRate;
wavesurfer.renderer.getWrapper().appendChild(
PitchContour({
peaks,
sampleRate,
height,
})
);
const data = extractFrequencies({ peaks, sampleRate });
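// Defer rendering briefly so wavesurfer has finished laying out its wrapper before the contour canvas is attached.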
setTimeout(() => {
renderPitchContour({
wrapper: wavesurfer.getWrapper(),
canvasId: `pitch-contour-${audio.id}-canvas`,
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
pointRadius: 1,
},
],
});
}, 1000);
setInitialized(true);
}),
];

View File

@@ -1,5 +1,6 @@
import { useEffect, useState, useRef, useCallback } from "react";
import { PitchContour } from "@renderer/components";
import { renderPitchContour } from "@renderer/lib/utils";
import { extractFrequencies } from "@/utils";
import WaveSurfer from "wavesurfer.js";
import { Button, Skeleton } from "@renderer/components/ui";
import { PlayIcon, PauseIcon } from "lucide-react";
@@ -59,17 +60,28 @@ export const PostRecording = (props: {
wavesurfer.on("pause", () => {
setIsPlaying(false);
}),
wavesurfer.on("decode", () => {
wavesurfer.on("ready", () => {
setDuration(wavesurfer.getDuration());
const peaks = wavesurfer.getDecodedData().getChannelData(0);
const sampleRate = wavesurfer.options.sampleRate;
wavesurfer.renderer.getWrapper().appendChild(
PitchContour({
peaks,
sampleRate,
height,
})
);
const data = extractFrequencies({ peaks, sampleRate });
setTimeout(() => {
renderPitchContour({
wrapper: wavesurfer.getWrapper(),
canvasId: `pitch-contour-${recording.id}-canvas`,
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
pointRadius: 1,
borderColor: "#fb6f92",
pointBorderColor: "#fb6f92",
pointBackgroundColor: "#ff8fab",
},
],
});
}, 1000);
setInitialized(true);
}),
];
@@ -119,15 +131,13 @@ export const PostRecording = (props: {
></div>
</div>
{
recording.referenceText && (
<div className="mt-2 bg-muted px-4 py-2 rounded">
<div className="text-muted-foreground text-center font-serif">
{recording.referenceText}
</div>
{recording.referenceText && (
<div className="mt-2 bg-muted px-4 py-2 rounded">
<div className="text-muted-foreground text-center font-serif">
{recording.referenceText}
</div>
)
}
</div>
)}
</div>
);
};

View File

@@ -8,29 +8,84 @@ export const Hotkeys = () => {
<>
<div className="font-semibold mb-4 capitilized">{t("hotkeys")}</div>
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">{t("quitApp")}</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
{commandOrCtrl} + Q
</kbd>
</div>
<Separator />
<div className="mb-6">
<div className="text-sm text-muted-foreground">{t("system")}</div>
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">{t("openPreferences")}</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
{commandOrCtrl} + ,
</kbd>
</div>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">{t("quitApp")}</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
{commandOrCtrl} + Q
</kbd>
</div>
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">{t("playOrPause")}</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
Space
</kbd>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">
{t("openPreferences")}
</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
{commandOrCtrl} + ,
</kbd>
</div>
<Separator />
</div>
<div className="mb-6">
<div className="text-sm text-muted-foreground">{t("player")}</div>
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">{t("playOrPause")}</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
Space
</kbd>
</div>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2 capitalize">
{t("startOrStopRecording")}
</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
r
</kbd>
</div>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2">
{t("playOrPauseRecording")}
</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
{commandOrCtrl} + r
</kbd>
</div>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2 capitalize">
{t("playPreviousSegment")}
</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
p
</kbd>
</div>
<Separator />
<div className="flex items-center justify-between py-4">
<div className="flex items-center space-x-2 capitalize">
{t("playNextSegment")}
</div>
<kbd className="bg-muted px-2 py-1 rounded-md text-sm text-muted-foreground">
n
</kbd>
</div>
<Separator />
</div>
<Separator />
</>
);
};

View File

@@ -117,7 +117,6 @@ export const OpenaiSettings = () => {
<Input
disabled={!editing}
placeholder={t("leaveEmptyToUseDefault")}
defaultValue=""
value={field.value}
onChange={field.onChange}
/>

View File

@@ -1,6 +1,7 @@
import { useEffect, useState, useRef, useCallback } from "react";
import WaveSurfer from "wavesurfer.js";
import { PitchContour } from "@renderer/components";
import { renderPitchContour } from "@renderer/lib/utils";
import { extractFrequencies } from "@/utils";
import { Button, Skeleton } from "@renderer/components/ui";
import { PlayIcon, PauseIcon } from "lucide-react";
import { useIntersectionObserver } from "@uidotdev/usehooks";
@@ -70,16 +71,23 @@ export const RecordingPlayer = (props: {
wavesurfer.on("timeupdate", (time: number) => {
onCurrentTimeChange?.(time);
}),
wavesurfer.on("decode", () => {
wavesurfer.on("ready", () => {
const peaks = wavesurfer.getDecodedData().getChannelData(0);
const sampleRate = wavesurfer.options.sampleRate;
wavesurfer.renderer.getWrapper().appendChild(
PitchContour({
peaks,
sampleRate,
height,
})
);
const data = extractFrequencies({ peaks, sampleRate });
setTimeout(() => {
renderPitchContour({
wrapper: wavesurfer.getWrapper(),
canvasId: `pitch-contour-${recording.id}-canvas`,
labels: new Array(data.length).fill(""),
datasets: [
{
data,
cubicInterpolationMode: "monotone",
},
],
});
}, 1000);
setInitialized(true);
}),
];

View File

@@ -1,6 +1,6 @@
export * from "./videos-table";
export * from "./video-edit-form";
export * from "./video-detail";
export * from "./video-player";
export * from "./videos-component";

View File

@@ -1,407 +0,0 @@
import { useEffect, useState, useContext } from "react";
import {
DbProviderContext,
AppSettingsProviderContext,
AISettingsProviderContext,
} from "@renderer/context";
import {
LoaderSpin,
RecordingsList,
PagePlaceholder,
MediaPlayer,
MediaTranscription,
} from "@renderer/components";
import { CheckCircleIcon, LoaderIcon } from "lucide-react";
import {
AlertDialog,
AlertDialogHeader,
AlertDialogDescription,
AlertDialogTitle,
AlertDialogContent,
AlertDialogFooter,
AlertDialogCancel,
Button,
PingPoint,
Progress,
ScrollArea,
toast,
} from "@renderer/components/ui";
import { t } from "i18next";
import { useTranscribe } from "@renderer/hooks";
import { useNavigate } from "react-router-dom";
export const VideoDetail = (props: { id?: string; md5?: string }) => {
const navigate = useNavigate();
const { id, md5 } = props;
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const { whisperConfig } = useContext(AISettingsProviderContext);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const [video, setVideo] = useState<VideoType | null>(null);
const [transcription, setTranscription] = useState<TranscriptionType>(null);
const [sharing, setSharing] = useState<boolean>(false);
// Transcription controls
const [transcribing, setTranscribing] = useState<boolean>(false);
const { transcribe } = useTranscribe();
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
// Player controls
const [initialized, setInitialized] = useState<boolean>(false);
const [currentTime, setCurrentTime] = useState<number>(0);
const [seek, setSeek] = useState<{
seekTo: number;
timestamp: number;
}>();
const [currentSegmentIndex, setCurrentSegmentIndex] = useState<number>(0);
const [recordButtonVisible, setRecordButtonVisible] =
useState<boolean>(false);
const [zoomRatio, setZoomRatio] = useState<number>(1.0);
const [isPlaying, setIsPlaying] = useState(false);
const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("all");
const [playBackRate, setPlaybackRate] = useState<number>(1);
const [displayInlineCaption, setDisplayInlineCaption] =
useState<boolean>(true);
const onTransactionUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (model === "Transcription" && action === "update") {
setTranscription(record);
}
};
const findOrCreateTranscription = async () => {
return EnjoyApp.transcriptions
.findOrCreate({
targetId: video.id,
targetType: "Video",
})
.then((transcription) => {
setTranscription(transcription);
});
};
const generateTranscription = async () => {
if (transcribing) return;
if (!transcription) {
await findOrCreateTranscription();
}
setTranscribing(true);
setTranscribingProgress(0);
try {
const { engine, model, result } = await transcribe(video.src, {
targetId: video.id,
targetType: "Video",
});
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result,
engine,
model,
});
} catch (err) {
toast.error(err.message);
}
setTranscribing(false);
};
const findTranscriptionFromWebApi = async () => {
if (!transcription) {
await findOrCreateTranscription();
}
const res = await webApi.transcriptions({
targetMd5: video.md5,
});
const transcript = (res?.transcriptions || []).filter((t) =>
["base", "small", "medium", "large", "whisper-1"].includes(t.model)
)?.[0];
if (!transcript) {
throw new Error("Transcription not found");
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: transcript.result,
engine: transcript.engine,
model: transcript.model,
});
};
const findOrGenerateTranscription = async () => {
try {
await findTranscriptionFromWebApi();
} catch (err) {
console.error(err);
await generateTranscription();
}
};
const handleShare = async () => {
if (!video.source.startsWith("http")) {
toast.error(t("shareFailed"), {
description: t("cannotShareLocalVideo"),
});
return;
}
if (!video.source && !video.isUploaded) {
try {
await EnjoyApp.videos.upload(video.id);
} catch (err) {
toast.error(t("shareFailed"), { description: err.message });
return;
}
}
webApi
.createPost({
targetType: "Video",
targetId: video.id,
})
.then(() => {
toast.success(t("sharedSuccessfully"), {
description: t("sharedVideo"),
});
})
.catch((err) => {
toast.error(t("shareFailed"), { description: err.message });
});
setSharing(false);
};
useEffect(() => {
const where = id ? { id } : { md5 };
EnjoyApp.videos.findOne(where).then((video) => {
if (video) {
setVideo(video);
} else {
toast.error(t("models.video.notFound"));
}
});
}, [id, md5]);
useEffect(() => {
if (!video) return;
findOrCreateTranscription();
}, [video]);
useEffect(() => {
if (!initialized) return;
if (!transcription) return;
addDblistener(onTranscriptionUpdate);
if (transcription?.state == "pending") {
findOrGenerateTranscription();
}
if (whisperConfig.service === "local") {
EnjoyApp.whisper.onProgress((_, p: number) => {
if (p > 100) p = 100;
setTranscribingProgress(p);
});
}
return () => {
removeDbListener(onTranscriptionUpdate);
EnjoyApp.whisper.removeProgressListeners();
};
}, [md5, transcription, initialized]);
if (!video) {
return <LoaderSpin />;
}
if (!video.src) {
return (
<PagePlaceholder placeholder="invalid" extra="cannot find play source" />
);
}
return (
<div className="relative">
<div className={`grid grid-cols-7 gap-4 ${initialized ? "" : "blur-sm"}`}>
<div className="col-span-5 h-[calc(100vh-6.5rem)] flex flex-col">
<MediaPlayer
mediaId={video.id}
mediaType="Video"
mediaUrl={video.src}
mediaMd5={video.md5}
transcription={transcription}
currentTime={currentTime}
setCurrentTime={setCurrentTime}
currentSegmentIndex={currentSegmentIndex}
setCurrentSegmentIndex={setCurrentSegmentIndex}
recordButtonVisible={recordButtonVisible}
setRecordButtonVisible={setRecordButtonVisible}
seek={seek}
initialized={initialized}
setInitialized={setInitialized}
zoomRatio={zoomRatio}
setZoomRatio={setZoomRatio}
isPlaying={isPlaying}
setIsPlaying={setIsPlaying}
playMode={playMode}
setPlayMode={setPlayMode}
playBackRate={playbackRate}
setPlaybackRate={setPlaybackRate}
displayInlineCaption={displayInlineCaption}
setDisplayInlineCaption={setDisplayInlineCaption}
onShare={() => setSharing(true)}
onDecoded={({ duration, sampleRate }) => {
if (video.duration) return;
EnjoyApp.videos.update(video.id, {
metadata: Object.assign({}, video.metadata, {
duration,
sampleRate,
}),
});
}}
/>
<ScrollArea
className={`flex-1 relative ${
recordButtonVisible ? "bg-muted" : "hidden"
}`}
>
<RecordingsList
key={`recordings-list-${video.id}-${currentSegmentIndex}`}
targetId={video.id}
targetType="Video"
referenceText={transcription?.result?.[currentSegmentIndex]?.text}
referenceId={currentSegmentIndex}
/>
</ScrollArea>
</div>
<div className="col-span-2 h-[calc(100vh-6.5rem)]">
<MediaTranscription
mediaId={video.id}
mediaType="Video"
mediaName={video.name}
transcription={transcription}
transcribing={transcribing}
progress={transcribingProgress}
transcribe={generateTranscription}
currentSegmentIndex={currentSegmentIndex}
onSelectSegment={(index) => {
if (currentSegmentIndex === index) return;
const segment = transcription?.result?.[index];
if (!segment) return;
if (playMode === "loop" && isPlaying) {
setIsPlaying(false);
}
setSeek({
seekTo: segment.offsets.from / 1000,
timestamp: Date.now(),
});
}}
/>
</div>
</div>
<AlertDialog open={sharing} onOpenChange={(value) => setSharing(value)}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("shareAudio")}</AlertDialogTitle>
<AlertDialogDescription>
{t("areYouSureToShareThisAudioToCommunity")}
</AlertDialogDescription>
</AlertDialogHeader>
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<Button variant="default" onClick={handleShare}>
{t("share")}
</Button>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
{/* Show loading progress until waveform is decoded & transcribed */}
<AlertDialog open={!initialized || !Boolean(transcription?.result)}>
<AlertDialogContent>
<AlertDialogHeader>
<AlertDialogTitle>{t("preparingVideo")}</AlertDialogTitle>
<AlertDialogDescription>
{t("itMayTakeAWhileToPrepareForTheFirstLoad")}
</AlertDialogDescription>
</AlertDialogHeader>
<div className="py-4">
{initialized ? (
<div className="mb-4 flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("waveformIsDecoded")}</span>
</div>
) : (
<div className="mb-4 flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("decodingWaveform")}</span>
</div>
)}
{!transcription ? (
<div className="flex items-center space-x-4">
<LoaderIcon className="w-4 h-4 animate-spin" />
<span>{t("loadingTranscription")}</span>
</div>
) : transcription.result ? (
<div className="flex items-center space-x-4">
<CheckCircleIcon className="w-4 h-4 text-green-500" />
<span>{t("transcribedSuccessfully")}</span>
</div>
) : transcribing ? (
<div className="">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{whisperConfig.service === "local" && (
<Progress value={transcribingProgress} />
)}
</div>
) : (
<div className="flex items-center space-x-4">
<PingPoint colorClassName="bg-muted" />
<div className="inline">
<span>{t("notTranscribedYet")}</span>
{initialized && (
<Button
onClick={generateTranscription}
className="ml-4"
size="sm"
>
{t("transcribe")}
</Button>
)}
</div>
</div>
)}
</div>
<AlertDialogFooter>
<Button variant="secondary" onClick={() => navigate(-1)}>
{t("cancel")}
</Button>
</AlertDialogFooter>
</AlertDialogContent>
</AlertDialog>
{!initialized && (
<div className="top-0 w-full h-full absolute z-30 bg-background/10 flex items-center justify-center">
<LoaderIcon className="text-muted-foreground animate-spin w-8 h-8" />
</div>
)}
</div>
);
};

View File

@@ -0,0 +1,72 @@
import { useEffect, useContext, useRef } from "react";
import { MediaPlayerProviderContext } from "@renderer/context";
import {
MediaLoadingModal,
MediaCaption,
MediaPlayerControls,
MediaTabs,
MediaCurrentRecording,
} from "@renderer/components";
import { formatDuration } from "@renderer/lib/utils";
import { useVideo } from "@renderer/hooks";
export const VideoPlayer = (props: { id?: string; md5?: string }) => {
const { id, md5 } = props;
const { media, currentTime, setMedia, setRef } = useContext(
MediaPlayerProviderContext
);
const { video } = useVideo({ id, md5 });
const ref = useRef(null);
useEffect(() => {
if (!video) return;
setMedia(video);
}, [video]);
useEffect(() => {
setRef(ref);
}, [ref]);
return (
<div data-testid="video-player">
<div className="h-[calc(100vh-37.5rem)] mb-4">
<div className="grid grid-cols-3 gap-4 px-6 h-full">
<div className="col-span-1 rounded-lg border shadow-lg h-[calc(100vh-37.5rem)]">
<MediaTabs />
</div>
<div className="col-span-2 h-[calc(100vh-37.5rem)]">
<MediaCaption />
</div>
</div>
</div>
<div className="h-[33rem] flex flex-col">
<div className="h-[13rem] py-2 px-6 mb-4">
<MediaCurrentRecording />
</div>
<div className="w-full h-[13rem] px-6 py-2 mb-4">
<div className="border rounded-xl shadow-lg relative">
<div data-testid="media-player-container" ref={ref} />
<div className="absolute right-2 top-1">
<span className="text-sm">
{formatDuration(currentTime || 0)}
</span>
<span className="mx-1">/</span>
<span className="text-sm">
{formatDuration(media?.duration || 0)}
</span>
</div>
</div>
</div>
<div className="w-full bg-background z-10 shadow-xl">
<MediaPlayerControls />
</div>
</div>
<MediaLoadingModal />
</div>
);
};

View File

@@ -239,7 +239,7 @@ export const VideosComponent = () => {
<AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
<AlertDialogDescription>
<p className="break-all">
{t("transcribeVideoConfirmation", {
{t("transcribeMediaConfirmation", {
name: transcribing?.name || "",
})}
</p>

View File

@@ -25,7 +25,7 @@ export const AISettingsProvider = ({
}: {
children: React.ReactNode;
}) => {
const [defaultEngine, setDefaultEngine] = useState<string>(null);
const [defaultEngine, setDefaultEngine] = useState<string>("openai");
const [openai, setOpenai] = useState<LlmProviderType>(null);
const [googleGenerativeAi, setGoogleGenerativeAi] =
useState<LlmProviderType>(null);

View File

@@ -2,3 +2,5 @@ export * from "./ai-settings-provider";
export * from "./app-settings-provider";
export * from "./db-provider";
export * from "./theme-provider";
export * from "./wavesurfer-provider";
export * from "./media-player-provider";

View File

@@ -0,0 +1,454 @@
import { createContext, useEffect, useState, useContext } from "react";
import { extractFrequencies } from "@/utils";
import { AppSettingsProviderContext } from "@renderer/context";
import { useTranscriptions, useRecordings } from "@renderer/hooks";
import WaveSurfer from "wavesurfer.js";
import Regions, {
type Region as RegionType,
} from "wavesurfer.js/dist/plugins/regions";
import Chart from "chart.js/auto";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { IPA_MAPPING } from "@/constants";
type MediaPlayerContextType = {
media: AudioType | VideoType;
setMedia: (media: AudioType | VideoType) => void;
setMediaProvider: (mediaProvider: HTMLAudioElement | null) => void;
waveform: WaveFormDataType;
// wavesurfer
wavesurfer: WaveSurfer;
setRef: (ref: any) => void;
decoded: boolean;
// player state
currentTime: number;
currentSegmentIndex: number;
setCurrentSegmentIndex: (index: number) => void;
zoomRatio: number;
setZoomRatio: (zoomRation: number) => void;
fitZoomRatio: number;
minPxPerSec: number;
// regions
regions: Regions | null;
activeRegion: RegionType;
setActiveRegion: (region: RegionType) => void;
editingRegion: boolean;
setEditingRegion: (editing: boolean) => void;
renderPitchContour: (
region: RegionType,
options?: {
repaint?: boolean;
canvasId?: string;
containerClassNames?: string[];
data?: Chart["data"];
}
) => void;
pitchChart: Chart;
// Transcription
transcription: TranscriptionType;
generateTranscription: () => void;
transcribing: boolean;
transcribingProgress: number;
transcriptionDraft: TranscriptionType["result"];
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
// Recordings
isRecording: boolean;
setIsRecording: (isRecording: boolean) => void;
currentRecording: RecordingType;
setCurrentRecording: (recording: RecordingType) => void;
recordings: RecordingType[];
fetchRecordings: (offset: number) => void;
loadingRecordings: boolean;
hasMoreRecordings: boolean;
};
export const MediaPlayerProviderContext =
createContext<MediaPlayerContextType>(null);
export const MediaPlayerProvider = ({
children,
}: {
children: React.ReactNode;
}) => {
const height = 192;
const minPxPerSec = 150;
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const [media, setMedia] = useState<AudioType | VideoType>(null);
const [mediaProvider, setMediaProvider] = useState<HTMLAudioElement | null>(
null
);
const [waveform, setWaveForm] = useState<WaveFormDataType>(null);
const [wavesurfer, setWavesurfer] = useState(null);
const [regions, setRegions] = useState<Regions | null>(null);
const [activeRegion, setActiveRegion] = useState<RegionType>(null);
const [editingRegion, setEditingRegion] = useState<boolean>(false);
const [pitchChart, setPitchChart] = useState<Chart>(null);
const [ref, setRef] = useState(null);
// Player state
const [decoded, setDecoded] = useState<boolean>(false);
const [currentTime, setCurrentTime] = useState<number>(0);
const [currentSegmentIndex, setCurrentSegmentIndex] = useState<number>(0);
const [fitZoomRatio, setFitZoomRatio] = useState<number>(1.0);
const [zoomRatio, setZoomRatio] = useState<number>(1.0);
const [isRecording, setIsRecording] = useState<boolean>(false);
const [currentRecording, setCurrentRecording] = useState<RecordingType>(null);
const [transcriptionDraft, setTranscriptionDraft] =
useState<TranscriptionType["result"]>();
const {
transcription,
generateTranscription,
transcribing,
transcribingProgress,
} = useTranscriptions(media);
const {
recordings,
fetchRecordings,
loading: loadingRecordings,
hasMore: hasMoreRecordings,
} = useRecordings(media, currentSegmentIndex);
const initializeWavesurfer = async () => {
if (!media) return;
if (!mediaProvider) return;
if (!ref.current) return;
const ws = WaveSurfer.create({
container: ref.current,
height,
waveColor: "#eaeaea",
progressColor: "#c0d6df",
cursorColor: "#ff0054",
barWidth: 2,
autoScroll: true,
minPxPerSec,
autoCenter: false,
dragToSeek: false,
fillParent: true,
media: mediaProvider,
peaks: waveform ? [waveform.peaks] : undefined,
duration: waveform ? waveform.duration : undefined,
});
const blob = await fetch(media.src).then((res) => res.blob());
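// With cached peaks and duration, wavesurfer can render immediately without re-decoding the audio.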
if (waveform) {
ws.loadBlob(blob, [waveform.peaks], waveform.duration);
setDecoded(true);
} else {
ws.loadBlob(blob);
}
setWavesurfer(ws);
};
const renderPitchContour = (
region: RegionType,
options?: {
repaint?: boolean;
canvasId?: string;
containerClassNames?: string[];
data?: Chart["data"];
}
) => {
if (!region) return;
if (!waveform?.frequencies?.length) return;
if (!wavesurfer) return;
const { repaint = true, containerClassNames = [] } = options || {};
const duration = wavesurfer.getDuration();
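// Map the region's time range onto index positions in the frequencies array.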
const fromIndex = Math.round(
(region.start / duration) * waveform.frequencies.length
);
const toIndex = Math.round(
(region.end / duration) * waveform.frequencies.length
);
const wrapper = (wavesurfer as any).renderer.getWrapper();
// remove existing pitch contour
if (repaint) {
wrapper
.querySelectorAll(".pitch-contour")
.forEach((element: HTMLDivElement) => {
element.remove();
});
}
// calculate offset and width
const wrapperWidth = wrapper.getBoundingClientRect().width;
const offsetLeft = (region.start / duration) * wrapperWidth;
const width = ((region.end - region.start) / duration) * wrapperWidth;
// create container and canvas
const pitchContourWidthContainer = document.createElement("div");
const canvas = document.createElement("canvas");
const canvasId = options?.canvasId || `pitch-contour-${region.id}-canvas`;
canvas.id = canvasId;
canvas.style.width = `${width}px`;
canvas.style.height = `${height}px`;
pitchContourWidthContainer.appendChild(canvas);
pitchContourWidthContainer.style.position = "absolute";
pitchContourWidthContainer.style.top = "0";
pitchContourWidthContainer.style.left = "0";
pitchContourWidthContainer.style.width = `${width}px`;
pitchContourWidthContainer.style.height = `${height}px`;
pitchContourWidthContainer.style.marginLeft = `${offsetLeft}px`;
pitchContourWidthContainer.classList.add(
"pitch-contour",
...containerClassNames
);
// pitchContourWidthContainer.style.zIndex = "3";
wrapper.appendChild(pitchContourWidthContainer);
// prepare chart data
let chartData: Chart["data"] = options?.data;
if (!chartData) {
const data = waveform.frequencies.slice(fromIndex, toIndex);
const regionDuration = region.end - region.start;
const labels = new Array(data.length).fill("");
const caption = transcription?.result?.timeline?.[currentSegmentIndex];
if (caption && region.id.startsWith("segment-region")) {
caption.timeline.forEach((segment: TimelineEntry) => {
const index = Math.round(
((segment.startTime - region.start) / regionDuration) * data.length
);
labels[index] = segment.text.trim();
});
} else if (region.id.startsWith("word-region")) {
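// Word region: collect the phones of every word in the region and label them with IPA symbols.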
const words = caption.timeline.filter(
(w: TimelineEntry) =>
w.startTime >= region.start &&
w.endTime <= region.end &&
w.type === "word"
);
let phones: TimelineEntry[] = [];
words.forEach((word: TimelineEntry) => {
word.timeline.forEach((token: TimelineEntry) => {
phones = phones.concat(token.timeline);
});
});
phones.forEach((phone: TimelineEntry) => {
const index = Math.round(
((phone.startTime - region.start) / regionDuration) * data.length
);
labels[index] = [
labels[index] || "",
(IPA_MAPPING as any)[phone.text.trim()] || phone.text.trim(),
].join("");
});
}
chartData = {
labels,
datasets: [
{
data,
cubicInterpolationMode: "monotone",
},
],
};
}
setPitchChart(
new Chart(canvas, {
type: "line",
data: chartData,
options: {
plugins: {
legend: {
display: false,
},
title: {
display: false,
},
},
scales: {
x: {
beginAtZero: true,
ticks: {
autoSkip: false,
},
display: true,
grid: {
display: false,
},
border: {
display: false,
},
},
y: {
beginAtZero: true,
display: false,
},
},
},
})
);
};
useEffect(() => {
if (!media) return;
EnjoyApp.waveforms.find(media.md5).then((waveform) => {
setWaveForm(waveform);
});
}, [media]);
/*
* Initialize wavesurfer when container ref is available
* and mediaProvider is available
*/
useEffect(() => {
initializeWavesurfer();
}, [media, ref, mediaProvider]);
/*
* When wavesurfer is decoded,
* set up event listeners for wavesurfer
* and clean up when component is unmounted
*/
useEffect(() => {
if (!wavesurfer) return;
setRegions(wavesurfer.registerPlugin(Regions.create()));
setCurrentTime(0);
const subscriptions = [
wavesurfer.on("loading", (percent: number) => console.log(`${percent}%`)),
wavesurfer.on("timeupdate", (time: number) => setCurrentTime(time)),
wavesurfer.on("decode", () => {
const peaks: Float32Array = wavesurfer
.getDecodedData()
.getChannelData(0);
const duration: number = wavesurfer.getDuration();
const sampleRate = wavesurfer.options.sampleRate;
const _frequencies = extractFrequencies({ peaks, sampleRate });
const _waveform = {
peaks: Array.from(peaks),
duration,
sampleRate,
frequencies: _frequencies,
};
EnjoyApp.waveforms.save(media.md5, _waveform);
setWaveForm(_waveform);
}),
wavesurfer.on("ready", () => {
setDecoded(true);
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [wavesurfer]);
/*
* Update fitZoomRatio when the active region changes
*/
useEffect(() => {
if (!ref?.current) return;
if (!wavesurfer) return;
if (!activeRegion) return;
const containerWidth = ref.current.getBoundingClientRect().width;
const duration = activeRegion.end - activeRegion.start;
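// Fit the region to the container: a sentence fills the full width, a word gets roughly a third of it.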
if (activeRegion.id.startsWith("segment-region")) {
setFitZoomRatio(containerWidth / duration / minPxPerSec);
} else if (activeRegion.id.startsWith("word-region")) {
setFitZoomRatio(containerWidth / 3 / duration / minPxPerSec);
}
}, [ref, wavesurfer, activeRegion]);
/*
* Zoom the waveform and re-render the pitch contour when zoomRatio updates
*/
useEffect(() => {
if (!wavesurfer) return;
if (!decoded) return;
wavesurfer.zoom(zoomRatio * minPxPerSec);
if (!activeRegion) return;
renderPitchContour(activeRegion);
wavesurfer.setScrollTime(activeRegion.start);
}, [zoomRatio, wavesurfer, decoded]);
/*
* Re-render pitch contour when the active region changes
*/
useEffect(() => {
if (!activeRegion) return;
renderPitchContour(activeRegion);
}, [activeRegion]);
/*
* Update player styles
*/
useEffect(() => {
if (!wavesurfer) return;
if (!decoded) return;
const scrollContainer = wavesurfer.getWrapper().closest(".scroll");
scrollContainer.style.scrollbarWidth = "thin";
}, [decoded, wavesurfer]);
return (
<MediaPlayerProviderContext.Provider
value={{
media,
setMedia,
setMediaProvider,
wavesurfer,
setRef,
decoded,
currentTime,
currentSegmentIndex,
setCurrentSegmentIndex,
waveform,
zoomRatio,
setZoomRatio,
fitZoomRatio,
minPxPerSec,
transcription,
regions,
renderPitchContour,
pitchChart,
activeRegion,
setActiveRegion,
editingRegion,
setEditingRegion,
generateTranscription,
transcribing,
transcribingProgress,
transcriptionDraft,
setTranscriptionDraft,
isRecording,
setIsRecording,
currentRecording,
setCurrentRecording,
recordings,
fetchRecordings,
loadingRecordings,
hasMoreRecordings,
}}
>
{children}
</MediaPlayerProviderContext.Provider>
);
};

View File

@@ -0,0 +1,185 @@
import { createContext, useEffect, useState, useContext } from "react";
import { extractFrequencies } from "@/utils";
import { AppSettingsProviderContext } from "@renderer/context";
import WaveSurfer from "wavesurfer.js";
import Regions, {
type Region as RegionType,
} from "wavesurfer.js/dist/plugins/regions";
type WavesurferContextType = {
media: AudioType | VideoType;
setMedia: (media: AudioType | VideoType) => void;
setMediaProvider: (mediaProvider: HTMLAudioElement | null) => void;
wavesurfer: WaveSurfer;
setRef: (ref: any) => void;
initialized: boolean;
currentTime: number;
currentSegmentIndex: number;
setCurrentSegmentIndex: (index: number) => void;
zoomRatio: number;
};
export const WavesurferContext = createContext<WavesurferContextType>(null);
export const WavesurferProvider = ({
children,
}: {
children: React.ReactNode;
}) => {
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const [media, setMedia] = useState<AudioType | VideoType>(null);
const [mediaProvider, setMediaProvider] = useState<HTMLAudioElement | null>(
null
);
const [wavesurfer, setWavesurfer] = useState(null);
const [regions, setRegions] = useState<Regions | null>(null);
const [ref, setRef] = useState(null);
// Player state
const [initialized, setInitialized] = useState<boolean>(false);
const [currentTime, setCurrentTime] = useState<number>(0);
const [seek, setSeek] = useState<{
seekTo: number;
timestamp: number;
}>();
const [currentSegmentIndex, setCurrentSegmentIndex] = useState<number>(0);
const [zoomRatio, setZoomRatio] = useState<number>(1.0);
const [isPlaying, setIsPlaying] = useState(false);
const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("all");
const [playbackRate, setPlaybackRate] = useState<number>(1);
const [displayInlineCaption, setDisplayInlineCaption] =
useState<boolean>(true);
const initializeWavesurfer = async () => {
if (!media) return;
if (!mediaProvider) return;
if (!ref.current) return;
const waveform = await EnjoyApp.waveforms.find(media.md5);
const ws = WaveSurfer.create({
container: ref.current,
height: 250,
waveColor: "#eee",
progressColor: "rgba(0, 0, 0, 0.15)",
cursorColor: "#aaa",
barWidth: 2,
autoScroll: true,
minPxPerSec: 150,
autoCenter: false,
dragToSeek: false,
media: mediaProvider,
peaks: waveform ? [waveform.peaks] : undefined,
duration: waveform ? waveform.duration : undefined,
});
const blob = await fetch(media.src).then((res) => res.blob());
if (waveform) {
ws.loadBlob(blob, [waveform.peaks], waveform.duration);
setInitialized(true);
} else {
ws.loadBlob(blob);
}
// Set up region plugin
setRegions(ws.registerPlugin(Regions.create()));
setWavesurfer(ws);
};
/*
* Initialize wavesurfer when container ref is available
* and mediaProvider is available
*/
useEffect(() => {
initializeWavesurfer();
}, [media, ref, mediaProvider]);
/*
* When wavesurfer is initialized,
* set up event listeners for wavesurfer
* and clean up when component is unmounted
*/
useEffect(() => {
if (!wavesurfer) return;
setCurrentTime(0);
setIsPlaying(false);
const subscriptions = [
wavesurfer.on("play", () => setIsPlaying(true)),
wavesurfer.on("pause", () => setIsPlaying(false)),
wavesurfer.on("loading", (percent: number) => console.log(`${percent}%`)),
wavesurfer.on("timeupdate", (time: number) => setCurrentTime(time)),
wavesurfer.on("decode", () => {
const peaks: Float32Array = wavesurfer
.getDecodedData()
.getChannelData(0);
const duration: number = wavesurfer.getDuration();
const sampleRate = wavesurfer.options.sampleRate;
const _frequencies = extractFrequencies({ peaks, sampleRate });
const _waveform = {
peaks: Array.from(peaks),
duration,
sampleRate,
frequencies: _frequencies,
};
EnjoyApp.waveforms.save(media.md5, _waveform);
}),
wavesurfer.on("ready", () => {
setInitialized(true);
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [wavesurfer]);
/*
* When regions are available,
* set up event listeners for regions
* and clean up when component is unmounted
*/
useEffect(() => {
if (!regions) return;
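// In loop mode, replay the first region when playback finishes; clicking any region plays just that range.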
const subscriptions = [
wavesurfer.on("finish", () => {
if (playMode !== "loop") return;
regions?.getRegions()[0]?.play();
}),
regions.on("region-created", (region: RegionType) => {
region.on("click", () => {
wavesurfer.play(region.start, region.end);
});
}),
];
return () => {
subscriptions.forEach((unsub) => unsub());
};
}, [wavesurfer, regions, playMode]);
return (
<WavesurferContext.Provider
value={{
media,
setMedia,
setMediaProvider,
wavesurfer,
setRef,
initialized,
currentTime,
currentSegmentIndex,
setCurrentSegmentIndex,
zoomRatio,
}}
>
{children}
</WavesurferContext.Provider>
);
};

View File

@@ -1,3 +1,10 @@
export * from './use-recordings';
export * from './use-transcribe';
export * from './use-transcriptions';
export * from './use-ai-command';
export * from './use-conversation';
export * from './use-audio';
export * from './use-video';

View File

@@ -0,0 +1,43 @@
import { useEffect, useContext, useState } from "react";
import {
DbProviderContext,
AppSettingsProviderContext,
} from "@renderer/context";
import { toast } from "@renderer/components/ui";
import { t } from "i18next";
export const useAudio = (options: { id?: string; md5?: string }) => {
const { id, md5 } = options;
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [audio, setAudio] = useState<AudioType>(null);
const onAudioUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (model !== "Audio") return;
if (record?.id != audio?.id) return;
if (action !== "update") return;
setAudio(record);
};
useEffect(() => {
const where = id ? { id } : { md5 };
EnjoyApp.audios.findOne(where).then((audio) => {
if (audio) {
setAudio(audio);
} else {
toast.error(t("models.audio.notFound"));
}
});
addDblistener(onAudioUpdate);
return () => {
removeDbListener(onAudioUpdate);
};
}, [id, md5]);
return {
audio,
};
};

View File

@@ -0,0 +1,101 @@
import { useState, useContext, useEffect, useReducer } from "react";
import {
AppSettingsProviderContext,
DbProviderContext,
} from "@renderer/context";
import { recordingsReducer } from "@renderer/reducers";
export const useRecordings = (
media: AudioType | VideoType,
referenceId: number
) => {
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [recordings, dispatchRecordings] = useReducer(recordingsReducer, []);
const [loading, setLoading] = useState(false);
const [hasMore, setHasMore] = useState(true);
const fetchRecordings = async (offset = 0) => {
setLoading(true);
const limit = 10;
EnjoyApp.recordings
.findAll({
limit,
offset,
where: {
targetId: media.id,
targetType: media.mediaType,
referenceId,
},
})
.then((_recordings) => {
if (_recordings.length < limit) {
setHasMore(false);
} else {
setHasMore(true);
}
dispatchRecordings({
type: offset === 0 ? "set" : "append",
records: _recordings,
});
})
.finally(() => {
setLoading(false);
});
};
const onRecordingsUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (model === "PronunciationAssessment" && action === "create") {
const recording = recordings.find((r) => r.id === record.targetId);
if (!recording) return;
dispatchRecordings({
type: "update",
record: { ...recording, pronunciationAssessment: record },
});
}
if (model != "Recording") return;
if (action === "destroy") {
dispatchRecordings({
type: "destroy",
record,
});
} else if (action === "create") {
if ((record as RecordingType).targetId !== media.id) return;
if ((record as RecordingType).referenceId !== referenceId) return;
dispatchRecordings({
type: "create",
record,
});
}
};
useEffect(() => {
addDblistener(onRecordingsUpdate);
return () => {
removeDbListener(onRecordingsUpdate);
};
}, [recordings]);
useEffect(() => {
if (!media) return;
fetchRecordings(0);
}, [media, referenceId]);
return {
recordings,
hasMore,
fetchRecordings,
loading,
};
};

View File

@@ -12,11 +12,10 @@ import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import axios from "axios";
import take from "lodash/take";
import sortedUniqBy from "lodash/sortedUniqBy";
import {
groupTranscription,
END_OF_WORD_REGEX,
milisecondsToTimestamp,
} from "@/utils";
import { groupTranscription, milisecondsToTimestamp } from "@/utils";
import { END_OF_SENTENCE_REGEX } from "@/constants";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { FFMPEG_CONVERT_WAV_OPTIONS } from "@/constants";
export const useTranscribe = () => {
const { EnjoyApp, ffmpegWasm, ffmpegValid, user, webApi } = useContext(
@@ -28,12 +27,16 @@ export const useTranscribe = () => {
if (ffmpegValid) {
if (src instanceof Blob) {
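// A Blob's MIME type may carry a codec suffix (e.g. "audio/webm;codecs=opus"); keep only the bare extension.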
src = await EnjoyApp.cacheObjects.writeFile(
`${Date.now()}.${src.type.split("/")[1]}`,
`${Date.now()}.${src.type.split("/")[1].split(";")[0]}`,
await src.arrayBuffer()
);
}
const output = `enjoy://library/cache/${src.split("/").pop()}.wav`;
const output = `enjoy://library/cache/${src
.split("/")
.pop()
.split(";")
.shift()}.wav`;
await EnjoyApp.ffmpeg.transcode(src, output, options);
const data = await fetchFile(output);
return new Blob([data], { type: "audio/wav" });
@@ -45,7 +48,7 @@ export const useTranscribe = () => {
const transcodeUsingWasm = async (src: string | Blob, options?: string[]) => {
if (!ffmpegWasm?.loaded) return;
options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
options = options || FFMPEG_CONVERT_WAV_OPTIONS;
try {
let uri: URL;
@@ -80,21 +83,32 @@ export const useTranscribe = () => {
): Promise<{
engine: string;
model: string;
result: TranscriptionResultSegmentGroupType[];
alignmentResult: AlignmentResult;
}> => {
const blob = await transcode(mediaSrc);
let result;
if (whisperConfig.service === "local") {
return transcribeByLocal(blob);
result = await transcribeByLocal(blob);
} else if (whisperConfig.service === "cloudflare") {
return transcribeByCloudflareAi(blob);
result = await transcribeByCloudflareAi(blob);
} else if (whisperConfig.service === "openai") {
return transcribeByOpenAi(blob);
result = await transcribeByOpenAi(blob);
} else if (whisperConfig.service === "azure") {
return transcribeByAzureAi(blob, params);
result = await transcribeByAzureAi(blob, params);
} else {
throw new Error(t("whisperServiceNotSupported"));
}
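// Force-align the recognized text against the audio with echogarden to get a word/phone-level timeline.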
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
result.result.map((segment) => segment.text).join(" ")
);
return {
...result,
alignmentResult,
};
};
const transcribeByLocal = async (blob: Blob) => {
@@ -267,7 +281,7 @@ export const useTranscribe = () => {
if (
index === best.Words.length - 1 &&
!text.trim().match(END_OF_WORD_REGEX)
!text.trim().match(END_OF_SENTENCE_REGEX)
) {
text = text + ".";
}

View File

@@ -0,0 +1,192 @@
import { useState, useContext, useEffect } from "react";
import { useTranscribe } from "@renderer/hooks";
import {
AISettingsProviderContext,
AppSettingsProviderContext,
DbProviderContext,
} from "@renderer/context";
import { toast } from "@renderer/components/ui";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";
export const useTranscriptions = (media: AudioType | VideoType) => {
const { whisperConfig } = useContext(AISettingsProviderContext);
const { EnjoyApp, webApi } = useContext(AppSettingsProviderContext);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [transcription, setTranscription] = useState<TranscriptionType>(null);
const { transcribe } = useTranscribe();
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
const [transcribing, setTranscribing] = useState<boolean>(false);
const onTranscriptionUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (
model === "Transcription" &&
record.id === transcription.id &&
action === "update"
) {
setTranscription(record);
}
};
const findOrCreateTranscription = async () => {
if (!media) return;
if (transcription) return;
return EnjoyApp.transcriptions
.findOrCreate({
targetId: media.id,
targetType: media.mediaType,
})
.then((t) => {
if (t.result && !t.result["transcript"]) {
t.result = null;
}
setTranscription(t);
})
.catch((err) => {
toast.error(err.message);
});
};
const generateTranscription = async () => {
if (transcribing) return;
if (!transcription) {
await findOrCreateTranscription();
}
setTranscribing(true);
setTranscribingProgress(0);
try {
const { engine, model, alignmentResult } = await transcribe(media.src, {
targetId: media.id,
targetType: media.mediaType,
});
let timeline: TimelineEntry[] = [];
if (alignmentResult) {
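// Flatten the aligned timeline: keep sentence entries as-is and expand any other entry into its children.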
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
}
/*
* Pre-process
* A word ending with a period, like Mr./Ms./Dr., should not close a
* sentence, so merge such fragments into the following entry.
* Iterate with an explicit index so the merged entry is re-checked
* after a splice (forEach would skip past it).
*/
let i = 0;
while (i < timeline.length - 1) {
const sentence = timeline[i];
const nextSentence = timeline[i + 1];
if (
!sentence.text
.replaceAll(MAGIC_TOKEN_REGEX, "")
.match(END_OF_SENTENCE_REGEX) &&
nextSentence.text
) {
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
nextSentence.timeline = [
...sentence.timeline,
...nextSentence.timeline,
];
nextSentence.startTime = sentence.startTime;
timeline.splice(i, 1);
} else {
i++;
}
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
},
engine,
model,
});
} catch (err) {
toast.error(err.message);
}
setTranscribing(false);
};
const findTranscriptionFromWebApi = async () => {
if (!transcription) {
await findOrCreateTranscription();
}
const res = await webApi.transcriptions({
targetMd5: media.md5,
});
const transcript = (res?.transcriptions || []).find((t) =>
["base", "small", "medium", "large", "whisper-1"].includes(t.model)
);
if (!transcript) {
return Promise.reject("Transcription not found");
}
if (!transcript.result["transcript"]) {
return Promise.reject("Transcription not aligned");
}
return EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: transcript.result,
engine: transcript.engine,
model: transcript.model,
});
};
const findOrGenerateTranscription = async () => {
try {
await findTranscriptionFromWebApi();
} catch (err) {
console.error(err);
await generateTranscription();
}
};
useEffect(() => {
if (!media) return;
findOrCreateTranscription();
}, [media]);
useEffect(() => {
if (!transcription) return;
addDblistener(onTranscriptionUpdate);
if (
transcription.state == "pending" ||
!transcription.result?.["transcript"]
) {
findOrGenerateTranscription();
}
if (whisperConfig.service === "local") {
EnjoyApp.whisper.onProgress((_, p: number) => {
if (p > 100) p = 100;
setTranscribingProgress(p);
});
}
return () => {
removeDbListener(onTranscriptionUpdate);
EnjoyApp.whisper.removeProgressListeners();
};
}, [transcription, media]);
return {
transcription,
transcribingProgress,
transcribing,
generateTranscription,
};
};

View File

@@ -0,0 +1,43 @@
import { useEffect, useContext, useState } from "react";
import {
DbProviderContext,
AppSettingsProviderContext,
} from "@renderer/context";
import { toast } from "@renderer/components/ui";
import { t } from "i18next";
export const useVideo = (options: { id?: string; md5?: string }) => {
const { id, md5 } = options;
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [video, setVideo] = useState<VideoType>(null);
const onVideoUpdate = (event: CustomEvent) => {
const { model, action, record } = event.detail || {};
if (model !== "Video") return;
if (record?.id != video?.id) return;
if (action !== "update") return;
setVideo(record);
};
useEffect(() => {
const where = id ? { id } : { md5 };
EnjoyApp.videos.findOne(where).then((video) => {
if (video) {
setVideo(video);
} else {
toast.error(t("models.video.notFound"));
}
});
addDblistener(onVideoUpdate);
return () => {
removeDbListener(onVideoUpdate);
};
}, [id, md5]);
return {
video,
};
};

View File

@@ -10,6 +10,7 @@ import i18next, { t } from "i18next";
dayjs.extend(localizedFormat);
dayjs.extend(duration);
dayjs.extend(relativeTime);
import Chart from "chart.js/auto";
export function cn(...inputs: ClassValue[]) {
return twMerge(clsx(inputs));
@@ -37,7 +38,8 @@ export function formatDuration(
format = "HH:mm:ss"
) {
dayjs.locale(i18next.resolvedLanguage?.toLowerCase() || "en");
return dayjs.duration(duration, unit).format(format);
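// Drop the leading "00:" hours segment for durations under an hour.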
const display = dayjs.duration(duration, unit).format(format);
return display.replace(/^00:/, "");
}
export function bytesToSize(bytes: number) {
@@ -78,3 +80,60 @@ export function formatDate(date: string | Date) {
return then.fromNow();
}
}
export function renderPitchContour(options: {
wrapper: HTMLElement;
canvasId: string;
labels: string[];
datasets: Chart["data"]["datasets"];
}) {
const { wrapper, datasets, labels, canvasId } = options;
const width = wrapper.getBoundingClientRect().width;
const height = wrapper.getBoundingClientRect().height;
const canvas = document.createElement("canvas");
canvas.id = canvasId;
canvas.style.position = "absolute";
canvas.style.width = `${width}px`;
canvas.style.height = `${height}px`;
canvas.style.top = "0";
canvas.style.left = "0";
wrapper.appendChild(canvas);
new Chart(canvas, {
type: "line",
data: {
labels,
datasets,
},
options: {
plugins: {
legend: {
display: false,
},
title: {
display: false,
},
},
scales: {
x: {
beginAtZero: true,
ticks: {
autoSkip: false,
},
display: false,
grid: {
display: false,
},
border: {
display: false,
},
},
y: {
display: false,
},
},
},
});
}

View File

@@ -1,8 +1,9 @@
import { useParams , useNavigate } from "react-router-dom";
import { AudioDetail } from "@renderer/components";
import { useParams, useNavigate } from "react-router-dom";
import { AudioPlayer } from "@renderer/components";
import { Button } from "@renderer/components/ui";
import { ChevronLeftIcon } from "lucide-react";
import { t } from "i18next";
import { MediaPlayerProvider } from "@renderer/context";
export default () => {
const navigate = useNavigate();
@@ -10,15 +11,17 @@ export default () => {
return (
<>
<div className="h-full px-4 py-6 xl:px-8">
<div className="flex space-x-1 items-center mb-4">
<div className="h-full relative">
<div className="flex space-x-1 items-center h-14 px-4 xl:px-8">
<Button variant="ghost" size="icon" onClick={() => navigate(-1)}>
<ChevronLeftIcon className="w-5 h-5" />
</Button>
<span>{t("shadowingAudio")}</span>
</div>
<AudioDetail id={id} />
<MediaPlayerProvider>
<AudioPlayer id={id} />
</MediaPlayerProvider>
</div>
</>
);

View File

@@ -15,6 +15,7 @@ import { t } from "i18next";
import {
DbProviderContext,
AppSettingsProviderContext,
MediaPlayerProvider,
} from "@renderer/context";
import { messagesReducer } from "@renderer/reducers";
import { v4 as uuidv4 } from "uuid";
@@ -249,52 +250,54 @@ export default () => {
</Sheet>
</div>
<ScrollArea ref={containerRef} className="px-4 flex-1">
<div className="messages flex flex-col-reverse gap-6 my-6">
<div className="w-full h-16"></div>
{messages.map((message) => (
<MessageComponent
key={message.id}
message={message}
configuration={{
type: conversation.type,
...conversation.configuration,
}}
onResend={() => {
if (message.status === "error") {
dispatchMessages({ type: "destroy", record: message });
}
<MediaPlayerProvider>
<ScrollArea ref={containerRef} className="px-4 flex-1">
<div className="messages flex flex-col-reverse gap-6 my-6">
<div className="w-full h-16"></div>
{messages.map((message) => (
<MessageComponent
key={message.id}
message={message}
configuration={{
type: conversation.type,
...conversation.configuration,
}}
onResend={() => {
if (message.status === "error") {
dispatchMessages({ type: "destroy", record: message });
}
handleSubmit(message.content);
}}
onRemove={() => {
if (message.status === "error") {
dispatchMessages({ type: "destroy", record: message });
} else {
EnjoyApp.messages.destroy(message.id).catch((err) => {
toast.error(err.message);
});
}
}}
/>
))}
{offset > -1 && (
<div className="flex justify-center">
<Button
variant="ghost"
onClick={() => fetchMessages()}
disabled={loading || offset === -1}
className="px-4 py-2"
>
{t("loadMore")}
{loading && (
<LoaderIcon className="h-4 w-4 animate-spin ml-2" />
)}
</Button>
</div>
)}
</div>
</ScrollArea>
handleSubmit(message.content);
}}
onRemove={() => {
if (message.status === "error") {
dispatchMessages({ type: "destroy", record: message });
} else {
EnjoyApp.messages.destroy(message.id).catch((err) => {
toast.error(err.message);
});
}
}}
/>
))}
{offset > -1 && (
<div className="flex justify-center">
<Button
variant="ghost"
onClick={() => fetchMessages()}
disabled={loading || offset === -1}
className="px-4 py-2"
>
{t("loadMore")}
{loading && (
<LoaderIcon className="h-4 w-4 animate-spin ml-2" />
)}
</Button>
</div>
)}
</div>
</ScrollArea>
</MediaPlayerProvider>
<div className="px-4 absolute w-full bottom-0 left-0 h-14 bg-muted z-50">
<div className="focus-within:bg-background px-4 py-2 flex items-center space-x-4 rounded-lg border">

View File

@@ -64,12 +64,12 @@ export default () => {
const presets = CONVERSATION_PRESETS.map((preset) =>
Object.assign({}, preset, {
engine: currentEngine.name,
engine: currentEngine?.name,
configuration: {
...preset.configuration,
tts: {
...preset.configuration.tts,
engine: currentEngine.name,
engine: currentEngine?.name,
},
},
})
@@ -78,7 +78,7 @@ export default () => {
const customPreset = {
key: "custom",
name: t("custom"),
engine: currentEngine.name,
engine: currentEngine?.name,
configuration: {
type: "gpt",
model: "gpt-4-turbo-preview",
@@ -92,7 +92,7 @@ export default () => {
historyBufferSize: 0,
tts: {
baseUrl: "",
engine: currentEngine.name,
engine: currentEngine?.name,
model: "tts-1",
voice: "alloy",
},
@@ -107,7 +107,7 @@ export default () => {
type: "tts",
tts: {
baseUrl: "",
engine: currentEngine.name,
engine: currentEngine?.name,
model: "tts-1",
voice: "alloy",
},

View File

@@ -1,8 +1,9 @@
import { useParams , useNavigate } from "react-router-dom";
import { VideoDetail } from "@renderer/components";
import { useParams, useNavigate } from "react-router-dom";
import { VideoPlayer } from "@renderer/components";
import { Button } from "@renderer/components/ui";
import { ChevronLeftIcon } from "lucide-react";
import { t } from "i18next";
import { MediaPlayerProvider } from "@renderer/context";
export default () => {
const navigate = useNavigate();
@@ -10,15 +11,17 @@ export default () => {
return (
<>
<div className="h-full px-4 py-6 xl:px-8">
<div className="flex space-x-1 items-center mb-4">
<div className="h-full relative">
<div className="flex space-x-1 items-center h-14 px-4 xl:px-8">
<Button variant="ghost" size="icon" onClick={() => navigate(-1)}>
<ChevronLeftIcon className="w-5 h-5" />
</Button>
<span>{t("shadowingVideo")}</span>
</div>
<VideoDetail id={id} />
<MediaPlayerProvider>
<VideoPlayer id={id} />
</MediaPlayerProvider>
</div>
</>
);

View File

@@ -1,4 +1,5 @@
type AudioType = {
mediaType: string;
id: string;
source: string;
name: string;

View File

@@ -206,6 +206,14 @@ type EnjoyAppType = {
}
) => Promise<SpeechType>;
};
echogarden: {
align: (
input: string | Uint8Array,
transcript: string,
options?: any
) => Promise<AlignmentResult>;
check: () => Promise<boolean>;
};
whisper: {
config: () => Promise<WhisperConfigType>;
check: () => Promise<{ success: boolean; log: string }>;

View File

@@ -5,7 +5,7 @@ type TranscriptionType = {
state: "pending" | "processing" | "finished";
engine: string;
model: string;
result: TranscriptionResultSegmentGroupType[];
result: AlignmentResult;
};
type TranscriptionResultSegmentType = {

View File

@@ -1,4 +1,5 @@
type VideoType = {
mediaType: string;
id: string;
source: string;
name: string;

View File

@@ -1,7 +1,19 @@
import Pitchfinder from "pitchfinder";
import { END_OF_SENTENCE_REGEX, MAGIC_TOKEN_REGEX } from "./constants";
export function generatePitch(peaks: Float32Array, sampleRate: number) {
const detectPitch = Pitchfinder.YIN({ sampleRate });
export const extractFrequencies = (props: {
peaks: Float32Array;
sampleRate: number;
}): number[] => {
const { peaks, sampleRate } = props;
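// AMDF pitch detection tuned for speech: track only 100–1000 Hz.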
const detectPitch = Pitchfinder.AMDF({
sampleRate,
sensitivity: 0.05,
minFrequency: 100,
maxFrequency: 1000,
ratio: 5,
});
const duration = peaks.length / sampleRate;
const bpm = peaks.length / duration / 60;
@@ -10,24 +22,8 @@ export function generatePitch(peaks: Float32Array, sampleRate: number) {
quantization: bpm,
});
// Find the baseline frequency (the value that appears most often)
const frequencyMap: any = {};
let maxAmount = 0;
let baseFrequency = 0;
frequencies.forEach((frequency) => {
if (!frequency) return;
const tolerance = 10;
frequency = Math.round(frequency * tolerance) / tolerance;
if (!frequencyMap[frequency]) frequencyMap[frequency] = 0;
frequencyMap[frequency] += 1;
if (frequencyMap[frequency] > maxAmount) {
maxAmount = frequencyMap[frequency];
baseFrequency = frequency;
}
});
return { frequencies, baseFrequency };
}
return frequencies;
};
export function milisecondsToTimestamp(ms: number) {
const hours = Math.floor(ms / 3600000).toString();
@@ -40,8 +36,6 @@ export function milisecondsToTimestamp(ms: number) {
)}:${seconds.padStart(2, "0")},${milliseconds}`;
}
export const MAGIC_TOKENS = ["Mrs.", "Ms.", "Mr.", "Dr.", "Prof.", "St."];
export const END_OF_WORD_REGEX = /[^\.!,\?][\.!\?]/g;
export const groupTranscription = (
transcription: TranscriptionResultSegmentType[]
): TranscriptionResultSegmentGroupType[] => {
@@ -75,8 +69,8 @@ export const groupTranscription = (
group.push(segment);
if (
!MAGIC_TOKENS.includes(text) &&
segment.text.trim().match(END_OF_WORD_REGEX)
!text.match(MAGIC_TOKEN_REGEX) &&
segment.text.trim().match(END_OF_SENTENCE_REGEX)
) {
// Group a complete sentence;
groups.push(generateGroup(group));

View File

@@ -70,6 +70,7 @@ module.exports = {
plugins: [
require("tailwindcss-animate"),
require("@tailwindcss/typography"),
require("tailwind-scrollbar"),
require("tailwind-scrollbar-hide"),
require("@vidstack/react/tailwind.cjs"),
],

View File

@@ -23,21 +23,11 @@ export default defineConfig((env) => {
formats: ["es"],
},
rollupOptions: {
external,
// external: [
// "axios",
// "child_process",
// "crypto",
// "fs-extra",
// "fs",
// "path",
// "sequelize",
// "umzug",
// "sqlite3",
// "fluent-ffmpeg",
// "ffmpeg-static",
// "@andrkrn/ffprobe-static",
// ],
external: [...external, "echogarden/dist/api/API.js"],
output: {
strict: false,
},
plugins: [],
},
commonjsOptions: {
transformMixedEsModules: true,

2858
yarn.lock

File diff suppressed because it is too large