diff --git a/enjoy/src/main/echogarden.ts b/enjoy/src/main/echogarden.ts index 217ac5f4..48f9ab11 100644 --- a/enjoy/src/main/echogarden.ts +++ b/enjoy/src/main/echogarden.ts @@ -10,7 +10,11 @@ import { trimAudioEnd, AudioSourceParam, } from "echogarden/dist/audio/AudioUtilities.js"; -import { Timeline } from "echogarden/dist/utilities/Timeline.d.js"; +import { wordTimelineToSegmentSentenceTimeline } from "echogarden/dist/utilities/Timeline.js"; +import { + type Timeline, + type TimelineEntry, +} from "echogarden/dist/utilities/Timeline.d.js"; import path from "path"; import log from "@main/logger"; import url from "url"; @@ -43,6 +47,7 @@ class EchogardenWrapper { public getRawAudioDuration: typeof getRawAudioDuration; public trimAudioStart: typeof trimAudioStart; public trimAudioEnd: typeof trimAudioEnd; + public wordTimelineToSegmentSentenceTimeline: typeof wordTimelineToSegmentSentenceTimeline; constructor() { this.align = Echogarden.align; @@ -54,6 +59,8 @@ class EchogardenWrapper { this.getRawAudioDuration = getRawAudioDuration; this.trimAudioStart = trimAudioStart; this.trimAudioEnd = trimAudioEnd; + this.wordTimelineToSegmentSentenceTimeline = + wordTimelineToSegmentSentenceTimeline; } async check() { @@ -132,6 +139,37 @@ class EchogardenWrapper { } ); + ipcMain.handle( + "echogarden-word-to-sentence-timeline", + async ( + _event, + wordTimeline: Timeline, + transcript: string, + language: string + ) => { + logger.debug("echogarden-word-to-sentence-timeline:", transcript); + + const { segmentTimeline } = + await this.wordTimelineToSegmentSentenceTimeline( + wordTimeline, + transcript, + language.split("-")[0] + ); + const timeline: Timeline = []; + segmentTimeline.forEach((t: TimelineEntry) => { + if (t.type === "sentence") { + timeline.push(t); + } else { + t.timeline.forEach((st) => { + timeline.push(st); + }); + } + }); + + return timeline; + } + ); + ipcMain.handle( "echogarden-transcode", async (_event, url: string, sampleRate?: number) => { diff --git a/enjoy/src/main/whisper.ts b/enjoy/src/main/whisper.ts index 287b257d..722af282 100644 --- a/enjoy/src/main/whisper.ts +++ b/enjoy/src/main/whisper.ts @@ -105,8 +105,6 @@ class Whipser { `--model "${model.savePath}"`, "--output-json", `--output-file "${path.join(tmpDir, "jfk")}"`, - `--split-on-word true`, - `--max-len 1`, ]; logger.debug(`Checking whisper command: ${commands.join(" ")}`); exec( @@ -205,9 +203,9 @@ class Whipser { "--print-progress", "--language", model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto", - `--split-on-word`, - `--max-len`, - "1", + // `--split-on-word`, + // `--max-len`, + // "1", ...extra, ]; diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts index 2d1789d5..d0782c10 100644 --- a/enjoy/src/preload.ts +++ b/enjoy/src/preload.ts @@ -441,7 +441,24 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", { return ipcRenderer.invoke("echogarden-align", input, transcript, options); }, alignSegments: (input: string, timeline: Timeline, options: any) => { - return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options); + return ipcRenderer.invoke( + "echogarden-align-segments", + input, + timeline, + options + ); + }, + wordToSentenceTimeline: ( + wordTimeline: Timeline, + transcript: string, + language: string + ) => { + return ipcRenderer.invoke( + "echogarden-word-to-sentence-timeline", + wordTimeline, + transcript, + language + ); }, transcode: (input: string) => { return ipcRenderer.invoke("echogarden-transcode", input); diff --git a/enjoy/src/renderer/components/medias/media-provider.tsx b/enjoy/src/renderer/components/medias/media-provider.tsx index 74c426f6..5f84b467 100644 --- a/enjoy/src/renderer/components/medias/media-provider.tsx +++ b/enjoy/src/renderer/components/medias/media-provider.tsx @@ -51,11 +51,13 @@ export const MediaProvider = () => { language: transcription.result.language, }) ); + }, [player, transcription]); + useEffect(() => { return () => { setMediaProvider(null); }; - }, [player, transcription]); + }, [media?.src]); if (!media?.src) return null; diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx index 4f6ef78d..882cc69b 100644 --- a/enjoy/src/renderer/context/media-player-provider.tsx +++ b/enjoy/src/renderer/context/media-player-provider.tsx @@ -559,7 +559,7 @@ export const MediaPlayerProvider = ({ setDecoded(false); setDecodeError(null); }; - }, [media?.src, ref, mediaProvider, layout?.playerHeight]); + }, [media?.src, ref?.current, mediaProvider, layout?.playerHeight]); /* cache last segment index */ useEffect(() => { diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx index d2db50cf..464d3007 100644 --- a/enjoy/src/renderer/hooks/use-transcribe.tsx +++ b/enjoy/src/renderer/hooks/use-transcribe.tsx @@ -19,82 +19,10 @@ import take from "lodash/take"; import sortedUniqBy from "lodash/sortedUniqBy"; import { parseText } from "media-captions"; -/* - * define the regex pattern to match the end of a sentence - * the end of a sentence is defined as a period, question mark, or exclamation mark - * also it may be followed by a quotation mark - * and exclude sepecial cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc." - */ -const sentenceEndPattern = /(? { - const timeline: TimelineEntry[] = []; - - wordTimeline.forEach((word, index) => { - word.text = word.text.trim(); - // skip empty words - if (!word.text) return; - // skip music or sound effects quoted in [] - if (word.text.match(/^\[.*\]$/)) return; - - const wordEntry = { - type: "word" as TimelineEntryType, - text: word.text, - startTime: word.startTime, - endTime: word.endTime, - }; - - let sentence: TimelineEntry; - // get the last sentence in the timeline - if (timeline.length > 0) { - sentence = timeline[timeline.length - 1]; - } - - // if there is no sentence in the timeline, create a new sentence - // if last sentence is a punctuation, create a new sentence - if (!sentence || sentence.text.match(sentenceEndPattern)) { - sentence = { - type: "sentence" as TimelineEntryType, - text: "", - startTime: wordEntry.startTime, - endTime: wordEntry.endTime, - timeline: [], - }; - timeline.push(sentence); - } - - // if the word is a punctuation, add it to the sentence and start a new sentence - if (wordEntry.text.match(sentenceEndPattern)) { - sentence.text += wordEntry.text; - sentence.endTime = wordEntry.endTime; - - const lastSentence = timeline[timeline.length - 1]; - if (lastSentence.endTime !== sentence.endTime) { - timeline.push(sentence); - } - } else { - sentence.text += wordEntry.text + " "; - sentence.endTime = wordEntry.endTime; - - if (index === wordTimeline.length - 1) { - timeline.push(sentence); - } - } - }); - - return timeline; -}; - export const useTranscribe = () => { const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext); const { openai } = useContext(AISettingsProviderContext); @@ -208,28 +136,11 @@ export const useTranscribe = () => { isolate, } ); - - wordTimeline.forEach((word: TimelineEntry) => { - let sentence = timeline.find( - (entry) => - word.startTime >= entry.startTime && word.endTime <= entry.endTime - ); - - if (sentence) { - sentence.timeline.push(word); - } - }); - - /* - * the start time of a sentence should be the start time of the first word in the sentence - * the end time of a sentence should the end time of the last word in the sentence - */ - // timeline.forEach((t) => { - // if (t.timeline.length === 0) return; - - // t.startTime = t.timeline[0].startTime; - // t.endTime = t.timeline[t.timeline.length - 1].endTime; - // }); + timeline = await EnjoyApp.echogarden.wordToSentenceTimeline( + wordTimeline, + transcript, + language.split("-")[0] + ); } else { // Remove all content inside `()`, `[]`, `{}` and trim the text // remove all markdown formatting @@ -299,20 +210,34 @@ export const useTranscribe = () => { } ); - const wordTimeline: TimelineEntry[] = res.transcription.map((word) => { - return { - type: "word" as TimelineEntryType, - text: word.text, - startTime: word.offsets.from / 1000.0, - endTime: word.offsets.to / 1000.0, - }; - }); - const timeline = wordTimelineToSentenceTimeline(wordTimeline); + const timeline: TimelineEntry[] = res.transcription + .map((segment) => { + // ignore the word if it is empty or in the format of `[xxx]` or `(xxx)` + if ( + !segment.text.trim() || + segment.text.trim().match(/^[\[\(].+[\]\)]$/) + ) { + return null; + } + + return { + type: "segment" as TimelineEntryType, + text: segment.text.trim(), + startTime: segment.offsets.from / 1000.0, + endTime: segment.offsets.to / 1000.0, + }; + }) + .filter((s) => Boolean(s?.text)); + + const transcript = timeline + .map((segment) => segment.text) + .join(" ") + .trim(); return { engine: "whisper", model: res.model.type, - text: res.transcription.map((segment) => segment.text).join(" "), + text: transcript, timeline, }; }; @@ -337,14 +262,14 @@ export const useTranscribe = () => { file, model: "whisper-1", response_format: "verbose_json", - timestamp_granularities: ["word"], + timestamp_granularities: ["segment"], })) as any; let timeline: TimelineEntry[] = []; if (res.segments) { res.segments.forEach((segment) => { const segmentTimeline = { - type: "sentence" as TimelineEntryType, + type: "segment" as TimelineEntryType, text: segment.text, startTime: segment.start, endTime: segment.end, @@ -353,16 +278,6 @@ export const useTranscribe = () => { timeline.push(segmentTimeline); }); - } else if (res.words) { - const wordTimeline = res.words.map((word) => { - return { - type: "word" as TimelineEntryType, - text: word.word, - startTime: word.start, - endTime: word.end, - }; - }); - timeline = wordTimelineToSentenceTimeline(wordTimeline); } return { @@ -390,15 +305,16 @@ export const useTranscribe = () => { }) ).data; - const wordTimeline = res.words.map((word) => { + const caption = await parseText(res.vtt, { type: "vtt" }); + const timeline: Timeline = caption.cues.map((cue) => { return { - type: "word" as TimelineEntryType, - text: word.word, - startTime: word.start, - endTime: word.end, + type: "segment", + text: cue.text, + startTime: cue.startTime, + endTime: cue.endTime, + timeline: [], }; }); - const timeline = wordTimelineToSentenceTimeline(wordTimeline); return { engine: "cloudflare", @@ -435,7 +351,13 @@ export const useTranscribe = () => { let results: SpeechRecognitionResultType[] = []; - return new Promise((resolve, reject) => { + const res: { + engine: string; + model: string; + text: string; + tokenId: number; + timeline?: TimelineEntry[]; + } = await new Promise((resolve, reject) => { reco.recognizing = (_s, e) => { setOutput(e.result.text); }; @@ -454,44 +376,41 @@ export const useTranscribe = () => { } reco.stopContinuousRecognitionAsync(); + console.log("CANCELED: Reason=" + e.reason); }; - reco.sessionStopped = (_s, _e) => { + reco.sessionStopped = async (_s, e) => { + console.log( + "Session stopped. Stop continuous recognition.", + e.sessionId, + results + ); reco.stopContinuousRecognitionAsync(); - const wordTimeline: TimelineEntry[] = []; + const transcript = results + .map((result) => result.DisplayText) + .join(" ") + .trim(); + + const timeline: Timeline = []; results.forEach((result) => { const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0]; - const splitedWords = best.Display.trim().split(" "); + const firstWord = best.Words[0]; + const lastWord = best.Words[best.Words.length - 1]; - best.Words.forEach((word, index) => { - let text = word.Word; - if (splitedWords.length === best.Words.length) { - text = splitedWords[index]; - } - - if ( - index === best.Words.length - 1 && - !text.trim().match(sentenceEndPattern) - ) { - text = text + "."; - } - - wordTimeline.push({ - type: "word" as TimelineEntryType, - text, - startTime: word.Offset / 10000000.0, - endTime: (word.Offset + word.Duration) / 10000000.0, - }); + timeline.push({ + type: "sentence", + text: best.Display, + startTime: firstWord.Offset / 10000000.0, + endTime: (lastWord.Offset + lastWord.Duration) / 10000000.0, + timeline: [], }); }); - const timeline = wordTimelineToSentenceTimeline(wordTimeline); - resolve({ engine: "azure", model: "whisper", - text: results.map((result) => result.DisplayText).join(" "), + text: transcript, timeline, tokenId: id, }); @@ -499,6 +418,8 @@ export const useTranscribe = () => { reco.startContinuousRecognitionAsync(); }); + + return res; }; return { diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx index 61095acc..41bbc4c1 100644 --- a/enjoy/src/renderer/hooks/use-transcriptions.tsx +++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx @@ -11,7 +11,7 @@ import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants"; export const useTranscriptions = (media: AudioType | VideoType) => { const { whisperConfig } = useContext(AISettingsProviderContext); - const { EnjoyApp, webApi, learningLanguage } = useContext( + const { EnjoyApp, learningLanguage } = useContext( AppSettingsProviderContext ); const { addDblistener, removeDbListener } = useContext(DbProviderContext); diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts index afccd064..3b93f8bb 100644 --- a/enjoy/src/types/enjoy-app.d.ts +++ b/enjoy/src/types/enjoy-app.d.ts @@ -257,6 +257,11 @@ type EnjoyAppType = { timeline: Timeline, options?: any ) => Promise; + wordToSentenceTimeline: ( + wordTimeline: Timeline, + transcript: string, + language: string + ) => Promise; transcode: (input: string) => Promise; check: () => Promise; }; diff --git a/enjoy/src/types/index.d.ts b/enjoy/src/types/index.d.ts index 9d5e5fb8..d4dd696e 100644 --- a/enjoy/src/types/index.d.ts +++ b/enjoy/src/types/index.d.ts @@ -77,6 +77,7 @@ type WhisperOutputType = { type CfWhipserOutputType = { text: string; + vtt: string; words_count: number; words: { word: string; diff --git a/enjoy/vite.main.config.ts b/enjoy/vite.main.config.ts index b2249ee0..df684736 100644 --- a/enjoy/vite.main.config.ts +++ b/enjoy/vite.main.config.ts @@ -27,6 +27,7 @@ export default defineConfig((env) => { ...external, "echogarden/dist/api/API.js", "echogarden/dist/audio/AudioUtilities.js", + "echogarden/dist/utilities/Timeline.js", ], output: { strict: false,