diff --git a/enjoy/src/main/echogarden.ts b/enjoy/src/main/echogarden.ts
index 217ac5f4..48f9ab11 100644
--- a/enjoy/src/main/echogarden.ts
+++ b/enjoy/src/main/echogarden.ts
@@ -10,7 +10,11 @@ import {
   trimAudioEnd,
   AudioSourceParam,
 } from "echogarden/dist/audio/AudioUtilities.js";
-import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
+import { wordTimelineToSegmentSentenceTimeline } from "echogarden/dist/utilities/Timeline.js";
+import {
+  type Timeline,
+  type TimelineEntry,
+} from "echogarden/dist/utilities/Timeline.d.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -43,6 +47,7 @@ class EchogardenWrapper {
   public getRawAudioDuration: typeof getRawAudioDuration;
   public trimAudioStart: typeof trimAudioStart;
   public trimAudioEnd: typeof trimAudioEnd;
+  public wordTimelineToSegmentSentenceTimeline: typeof wordTimelineToSegmentSentenceTimeline;
 
   constructor() {
     this.align = Echogarden.align;
@@ -54,6 +59,8 @@ class EchogardenWrapper {
     this.getRawAudioDuration = getRawAudioDuration;
     this.trimAudioStart = trimAudioStart;
     this.trimAudioEnd = trimAudioEnd;
+    this.wordTimelineToSegmentSentenceTimeline =
+      wordTimelineToSegmentSentenceTimeline;
   }
 
   async check() {
@@ -132,6 +139,37 @@ class EchogardenWrapper {
       }
     );
 
+    // Group a word-level timeline into sentences and return them as one flat list.
+    ipcMain.handle(
+      "echogarden-word-to-sentence-timeline",
+      async (
+        _event,
+        wordTimeline: Timeline,
+        transcript: string,
+        language: string
+      ) => {
+        logger.debug("echogarden-word-to-sentence-timeline:", transcript);
+
+        const { segmentTimeline } =
+          await this.wordTimelineToSegmentSentenceTimeline(
+            wordTimeline,
+            transcript,
+            // Only the part before the first "-" is passed on (e.g. "en-US" -> "en").
+            language.split("-")[0]
+          );
+
+        // Keep top-level sentences as they are and lift the nested entries out
+        // of every other entry. An entry that carries no nested `timeline`
+        // contributes nothing instead of throwing on `undefined.forEach`.
+        const timeline: Timeline = segmentTimeline.flatMap(
+          (entry: TimelineEntry) =>
+            entry.type === "sentence" ? [entry] : (entry.timeline ?? [])
+        );
+
+        return timeline;
+      }
+    );
+
     ipcMain.handle(
       "echogarden-transcode",
       async (_event, url: string, sampleRate?: number) => {
diff --git a/enjoy/src/main/whisper.ts b/enjoy/src/main/whisper.ts
index 287b257d..722af282 100644
--- a/enjoy/src/main/whisper.ts
+++ b/enjoy/src/main/whisper.ts
@@ -105,8 +105,6 @@ class Whipser {
         `--model "${model.savePath}"`,
         "--output-json",
         `--output-file "${path.join(tmpDir, "jfk")}"`,
-        `--split-on-word true`,
-        `--max-len 1`,
       ];
       logger.debug(`Checking whisper command: ${commands.join(" ")}`);
       exec(
@@ -205,9 +203,9 @@ class Whipser {
       "--print-progress",
       "--language",
       model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
-      `--split-on-word`,
-      `--max-len`,
-      "1",
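+      // `--split-on-word` and `--max-len 1` are intentionally no longer passed:
+      // the JSON output is now consumed as whole segments rather than as
+      // one-word entries, so whisper's default segmentation is kept.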
       ...extra,
     ];
 
diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts
index 2d1789d5..d0782c10 100644
--- a/enjoy/src/preload.ts
+++ b/enjoy/src/preload.ts
@@ -441,7 +441,24 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
       return ipcRenderer.invoke("echogarden-align", input, transcript, options);
     },
     alignSegments: (input: string, timeline: Timeline, options: any) => {
-      return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
+      return ipcRenderer.invoke(
+        "echogarden-align-segments",
+        input,
+        timeline,
+        options
+      );
+    },
+    wordToSentenceTimeline: (
+      wordTimeline: Timeline,
+      transcript: string,
+      language: string
+    ) => {
+      return ipcRenderer.invoke(
+        "echogarden-word-to-sentence-timeline",
+        wordTimeline,
+        transcript,
+        language
+      );
     },
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
diff --git a/enjoy/src/renderer/components/medias/media-provider.tsx b/enjoy/src/renderer/components/medias/media-provider.tsx
index 74c426f6..5f84b467 100644
--- a/enjoy/src/renderer/components/medias/media-provider.tsx
+++ b/enjoy/src/renderer/components/medias/media-provider.tsx
@@ -51,11 +51,13 @@ export const MediaProvider = () => {
         language: transcription.result.language,
       })
     );
+  }, [player, transcription]);
 
+  useEffect(() => {
     return () => {
       setMediaProvider(null);
     };
-  }, [player, transcription]);
+  }, [media?.src]);
 
   if (!media?.src) return null;
 
diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx
index 4f6ef78d..882cc69b 100644
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -559,7 +559,7 @@ export const MediaPlayerProvider = ({
       setDecoded(false);
       setDecodeError(null);
     };
-  }, [media?.src, ref, mediaProvider, layout?.playerHeight]);
+  }, [media?.src, ref?.current, mediaProvider, layout?.playerHeight]);
 
   /* cache last segment index */
   useEffect(() => {
diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx
index d2db50cf..464d3007 100644
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -19,82 +19,10 @@ import take from "lodash/take";
 import sortedUniqBy from "lodash/sortedUniqBy";
 import { parseText } from "media-captions";
 
-/*
- * define the regex pattern to match the end of a sentence
- * the end of a sentence is defined as a period, question mark, or exclamation mark
- * also it may be followed by a quotation mark
- * and exclude sepecial cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc."
- */
-const sentenceEndPattern = /(?<!Mr|Mrs|Dr|Ms|etc)\.|\?|!\"?/;
-
 // test a text string has any punctuations or not
 // some transcribed text may not have any punctuations
 const punctuationsPattern = /\w[.,!?](\s|$)/g;
 
-/*
- * convert the word timeline to sentence timeline
- * a sentence is a group of words that ends with a punctuation
- */
-const wordTimelineToSentenceTimeline = (
-  wordTimeline: TimelineEntry[]
-): TimelineEntry[] => {
-  const timeline: TimelineEntry[] = [];
-
-  wordTimeline.forEach((word, index) => {
-    word.text = word.text.trim();
-    // skip empty words
-    if (!word.text) return;
-    // skip music or sound effects quoted in []
-    if (word.text.match(/^\[.*\]$/)) return;
-
-    const wordEntry = {
-      type: "word" as TimelineEntryType,
-      text: word.text,
-      startTime: word.startTime,
-      endTime: word.endTime,
-    };
-
-    let sentence: TimelineEntry;
-    // get the last sentence in the timeline
-    if (timeline.length > 0) {
-      sentence = timeline[timeline.length - 1];
-    }
-
-    // if there is no sentence in the timeline, create a new sentence
-    // if last sentence is a punctuation, create a new sentence
-    if (!sentence || sentence.text.match(sentenceEndPattern)) {
-      sentence = {
-        type: "sentence" as TimelineEntryType,
-        text: "",
-        startTime: wordEntry.startTime,
-        endTime: wordEntry.endTime,
-        timeline: [],
-      };
-      timeline.push(sentence);
-    }
-
-    // if the word is a punctuation, add it to the sentence and start a new sentence
-    if (wordEntry.text.match(sentenceEndPattern)) {
-      sentence.text += wordEntry.text;
-      sentence.endTime = wordEntry.endTime;
-
-      const lastSentence = timeline[timeline.length - 1];
-      if (lastSentence.endTime !== sentence.endTime) {
-        timeline.push(sentence);
-      }
-    } else {
-      sentence.text += wordEntry.text + " ";
-      sentence.endTime = wordEntry.endTime;
-
-      if (index === wordTimeline.length - 1) {
-        timeline.push(sentence);
-      }
-    }
-  });
-
-  return timeline;
-};
-
 export const useTranscribe = () => {
   const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
   const { openai } = useContext(AISettingsProviderContext);
@@ -208,28 +136,11 @@ export const useTranscribe = () => {
           isolate,
         }
       );
-
-      wordTimeline.forEach((word: TimelineEntry) => {
-        let sentence = timeline.find(
-          (entry) =>
-            word.startTime >= entry.startTime && word.endTime <= entry.endTime
-        );
-
-        if (sentence) {
-          sentence.timeline.push(word);
-        }
-      });
-
-      /*
-       * the start time of a sentence should be the start time of the first word in the sentence
-       * the end time of a sentence should the end time of the last word in the sentence
-       */
-      // timeline.forEach((t) => {
-      //   if (t.timeline.length === 0) return;
-
-      //   t.startTime = t.timeline[0].startTime;
-      //   t.endTime = t.timeline[t.timeline.length - 1].endTime;
-      // });
+      timeline = await EnjoyApp.echogarden.wordToSentenceTimeline(
+        wordTimeline,
+        transcript,
+        language.split("-")[0]
+      );
     } else {
       // Remove all content inside `()`, `[]`, `{}` and trim the text
       // remove all markdown formatting
@@ -299,20 +210,34 @@ export const useTranscribe = () => {
       }
     );
 
-    const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
-      return {
-        type: "word" as TimelineEntryType,
-        text: word.text,
-        startTime: word.offsets.from / 1000.0,
-        endTime: word.offsets.to / 1000.0,
-      };
-    });
-    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+    // Turn whisper segments into timeline entries, dropping non-speech ones.
+    const timeline: TimelineEntry[] = res.transcription.flatMap((segment) => {
+      const text = segment.text.trim();
+
+      // ignore the segment if it is empty or in the format of `[xxx]` or `(xxx)`
+      if (!text || text.match(/^[\[\(].+[\]\)]$/)) {
+        return [];
+      }
+
+      return [
+        {
+          type: "segment" as TimelineEntryType,
+          text,
+          startTime: segment.offsets.from / 1000.0,
+          endTime: segment.offsets.to / 1000.0,
+        },
+      ];
+    });
+
+    const transcript = timeline
+      .map((segment) => segment.text)
+      .join(" ")
+      .trim();
 
     return {
       engine: "whisper",
       model: res.model.type,
-      text: res.transcription.map((segment) => segment.text).join(" "),
+      text: transcript,
       timeline,
     };
   };
@@ -337,14 +262,14 @@ export const useTranscribe = () => {
       file,
       model: "whisper-1",
       response_format: "verbose_json",
-      timestamp_granularities: ["word"],
+      timestamp_granularities: ["segment"],
     })) as any;
 
     let timeline: TimelineEntry[] = [];
     if (res.segments) {
       res.segments.forEach((segment) => {
         const segmentTimeline = {
-          type: "sentence" as TimelineEntryType,
+          type: "segment" as TimelineEntryType,
           text: segment.text,
           startTime: segment.start,
           endTime: segment.end,
@@ -353,16 +278,6 @@ export const useTranscribe = () => {
 
         timeline.push(segmentTimeline);
       });
-    } else if (res.words) {
-      const wordTimeline = res.words.map((word) => {
-        return {
-          type: "word" as TimelineEntryType,
-          text: word.word,
-          startTime: word.start,
-          endTime: word.end,
-        };
-      });
-      timeline = wordTimelineToSentenceTimeline(wordTimeline);
     }
 
     return {
@@ -390,15 +305,16 @@ export const useTranscribe = () => {
       })
     ).data;
 
-    const wordTimeline = res.words.map((word) => {
+    const caption = await parseText(res.vtt, { type: "vtt" });
+    const timeline: Timeline = caption.cues.map((cue) => {
       return {
-        type: "word" as TimelineEntryType,
-        text: word.word,
-        startTime: word.start,
-        endTime: word.end,
+        type: "segment",
+        text: cue.text,
+        startTime: cue.startTime,
+        endTime: cue.endTime,
+        timeline: [],
       };
     });
-    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
 
     return {
       engine: "cloudflare",
@@ -435,7 +351,13 @@ export const useTranscribe = () => {
 
     let results: SpeechRecognitionResultType[] = [];
 
-    return new Promise((resolve, reject) => {
+    const res: {
+      engine: string;
+      model: string;
+      text: string;
+      tokenId: number;
+      timeline?: TimelineEntry[];
+    } = await new Promise((resolve, reject) => {
       reco.recognizing = (_s, e) => {
         setOutput(e.result.text);
       };
@@ -454,44 +376,41 @@ export const useTranscribe = () => {
         }
 
         reco.stopContinuousRecognitionAsync();
+        console.log("CANCELED: Reason=" + e.reason);
       };
 
-      reco.sessionStopped = (_s, _e) => {
+      reco.sessionStopped = async (_s, e) => {
+        console.log(
+          "Session stopped. Stop continuous recognition.",
+          e.sessionId,
+          results
+        );
         reco.stopContinuousRecognitionAsync();
 
-        const wordTimeline: TimelineEntry[] = [];
+        const transcript = results
+          .map((result) => result.DisplayText)
+          .join(" ")
+          .trim();
+
+        const timeline: Timeline = [];
         results.forEach((result) => {
           const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
-          const splitedWords = best.Display.trim().split(" ");
+          const firstWord = best.Words[0];
+          const lastWord = best.Words[best.Words.length - 1];
 
-          best.Words.forEach((word, index) => {
-            let text = word.Word;
-            if (splitedWords.length === best.Words.length) {
-              text = splitedWords[index];
-            }
-
-            if (
-              index === best.Words.length - 1 &&
-              !text.trim().match(sentenceEndPattern)
-            ) {
-              text = text + ".";
-            }
-
-            wordTimeline.push({
-              type: "word" as TimelineEntryType,
-              text,
-              startTime: word.Offset / 10000000.0,
-              endTime: (word.Offset + word.Duration) / 10000000.0,
-            });
+          timeline.push({
+            type: "sentence",
+            text: best.Display,
+            startTime: firstWord.Offset / 10000000.0,
+            endTime: (lastWord.Offset + lastWord.Duration) / 10000000.0,
+            timeline: [],
           });
         });
 
-        const timeline = wordTimelineToSentenceTimeline(wordTimeline);
-
         resolve({
           engine: "azure",
           model: "whisper",
-          text: results.map((result) => result.DisplayText).join(" "),
+          text: transcript,
           timeline,
           tokenId: id,
         });
@@ -499,6 +418,8 @@ export const useTranscribe = () => {
 
       reco.startContinuousRecognitionAsync();
     });
+
+    return res;
   };
 
   return {
diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
index 61095acc..41bbc4c1 100644
--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -11,7 +11,7 @@ import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";
 
 export const useTranscriptions = (media: AudioType | VideoType) => {
   const { whisperConfig } = useContext(AISettingsProviderContext);
-  const { EnjoyApp, webApi, learningLanguage } = useContext(
+  const { EnjoyApp, learningLanguage } = useContext(
     AppSettingsProviderContext
   );
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts
index afccd064..3b93f8bb 100644
--- a/enjoy/src/types/enjoy-app.d.ts
+++ b/enjoy/src/types/enjoy-app.d.ts
@@ -257,6 +257,11 @@ type EnjoyAppType = {
       timeline: Timeline,
       options?: any
     ) => Promise<Timeline>;
+    wordToSentenceTimeline: (
+      wordTimeline: Timeline,
+      transcript: string,
+      language: string
+    ) => Promise<Timeline>;
     transcode: (input: string) => Promise<string>;
     check: () => Promise<boolean>;
   };
diff --git a/enjoy/src/types/index.d.ts b/enjoy/src/types/index.d.ts
index 9d5e5fb8..d4dd696e 100644
--- a/enjoy/src/types/index.d.ts
+++ b/enjoy/src/types/index.d.ts
@@ -77,6 +77,7 @@ type WhisperOutputType = {
 
 type CfWhipserOutputType = {
   text: string;
+  vtt: string;
   words_count: number;
   words: {
     word: string;
diff --git a/enjoy/vite.main.config.ts b/enjoy/vite.main.config.ts
index b2249ee0..df684736 100644
--- a/enjoy/vite.main.config.ts
+++ b/enjoy/vite.main.config.ts
@@ -27,6 +27,7 @@ export default defineConfig((env) => {
           ...external,
           "echogarden/dist/api/API.js",
           "echogarden/dist/audio/AudioUtilities.js",
+          "echogarden/dist/utilities/Timeline.js",
         ],
         output: {
           strict: false,