diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json index d42e7edc..627a3822 100644 --- a/enjoy/src/i18n/en.json +++ b/enjoy/src/i18n/en.json @@ -611,8 +611,11 @@ "assessing": "Assessing", "assessedSuccessfully": "Assessed successfully", "optinal": "Optional", - "uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)", + "uploadTranscriptFile": "Upload transcript file", + "uploadTranscriptFileDescription": "Optional. Supported formats: txt/srt/vtt.", "onlyTextFileIsSupported": "Only text file is supported", + "isolateVoice": "Isolate voice", + "isolateVoiceDescription": "Isolates the voice from any music or background ambience. More accurate but slower.", "sortBy": "Sort by", "createdAtDesc": "Created at desc", "createdAtAsc": "Created at asc", @@ -627,5 +630,6 @@ "search": "Search", "noData": "No data", "selectedFiles": "Selected files", - "moreOptions": "More options" + "moreOptions": "More options", + "lessOptions": "Fewer options" } diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json index 549da3c6..adb6a9c0 100644 --- a/enjoy/src/i18n/zh-CN.json +++ b/enjoy/src/i18n/zh-CN.json @@ -611,8 +611,11 @@ "assessing": "正在评估", "assessedSuccessfully": "评估成功", "optinal": "可选", - "uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)", + "uploadTranscriptFile": "上传字幕文件", + "uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。", "onlyTextFileIsSupported": "仅支持文本文件", + "isolateVoice": "提取人声", + "isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。", "sortBy": "排序", "createdAtDesc": "创建时间降序", "createdAtAsc": "创建时间升序", @@ -627,5 +630,6 @@ "search": "搜索", "noData": "没有数据", "selectedFiles": "已选中文件", - "moreOptions": "更多选项" + "moreOptions": "更多选项", + "lessOptions": "更少选项" } diff --git a/enjoy/src/main/window.ts b/enjoy/src/main/window.ts index 59fb88da..785d30f0 100644 --- a/enjoy/src/main/window.ts +++ b/enjoy/src/main/window.ts @@ -431,6 +431,28 @@ ${log} return { action: "allow" }; }); + // Capture stderr & stdout and send them to renderer + const originalStderrWrite = process.stderr.write.bind(process.stderr); + process.stderr.write = (chunk, encoding?, callback?) => { + // Remove ANSI color codes + const output = chunk + .toString() + .replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, ""); + mainWindow.webContents.send("app-on-cmd-output", output); + + return originalStderrWrite(chunk, encoding, callback); + }; + const originalStdoutWrite = process.stdout.write.bind(process.stdout); + process.stdout.write = (chunk, encoding?, callback?) => { + // Remove ANSI color codes + const output = chunk + .toString() + .replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, ""); + mainWindow.webContents.send("app-on-cmd-output", output); + + return originalStdoutWrite(chunk, encoding, callback); + }; + // and load the index.html of the app.
if (MAIN_WINDOW_VITE_DEV_SERVER_URL) { mainWindow.loadURL(MAIN_WINDOW_VITE_DEV_SERVER_URL); diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts index 7366a292..0a2047e4 100644 --- a/enjoy/src/preload.ts +++ b/enjoy/src/preload.ts @@ -35,6 +35,12 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", { createIssue: (title: string, body: string) => { return ipcRenderer.invoke("app-create-issue", title, body); }, + onCmdOutput: (callback: (event: IpcRendererEvent, data: string) => void) => { + ipcRenderer.on("app-on-cmd-output", callback); + }, + removeCmdOutputListeners: () => { + ipcRenderer.removeAllListeners("app-on-cmd-output"); + }, version, }, window: { diff --git a/enjoy/src/renderer/components/medias/media-current-recording.tsx b/enjoy/src/renderer/components/medias/media-current-recording.tsx index 917ea7d8..50433947 100644 --- a/enjoy/src/renderer/components/medias/media-current-recording.tsx +++ b/enjoy/src/renderer/components/medias/media-current-recording.tsx @@ -64,7 +64,7 @@ export const MediaCurrentRecording = () => { currentTime: mediaCurrentTime, } = useContext(MediaPlayerProviderContext); const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext); - const { enabled, currentHotkeys } = useContext( + const { currentHotkeys } = useContext( HotKeysSettingsProviderContext ); const [player, setPlayer] = useState(null); diff --git a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx index 01310c6c..7c3bd3e6 100644 --- a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx +++ b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx @@ -9,6 +9,7 @@ import { AlertDialogContent, AlertDialogTitle, AlertDialogDescription, + toast, } from "@renderer/components/ui"; import { LoaderIcon } from "lucide-react"; import { TranscriptionCreateForm } from "../transcriptions"; @@ -22,6 +23,7 @@ export const MediaTranscriptionGenerateButton = (props: { transcribing, transcription, transcribingProgress, + transcribingOutput, } = useContext(MediaPlayerProviderContext); const [open, setOpen] = useState(false); @@ -62,11 +64,18 @@ export const MediaTranscriptionGenerateButton = (props: { originalText: data.text, language: data.language, service: data.service as WhisperConfigType["service"], - }); - setOpen(false); + isolate: data.isolate, + }) + .then(() => { + setOpen(false); + }) + .catch((e) => { + toast.error(e.message); + }); }} transcribing={transcribing} transcribingProgress={transcribingProgress} + transcribingOutput={transcribingOutput} /> diff --git a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx index fabcfa22..80ff3b90 100644 --- a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx +++ b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx @@ -12,6 +12,7 @@ import { CollapsibleContent, CollapsibleTrigger, Form, + FormDescription, FormField, FormItem, FormLabel, @@ -24,6 +25,7 @@ import { SelectItem, SelectTrigger,
SelectValue, + Switch, Textarea, toast, } from "@renderer/components/ui"; @@ -36,18 +38,21 @@ const transcriptionSchema = z.object({ language: z.string(), service: z.string(), text: z.string().optional(), + isolate: z.boolean().optional(), }); export const TranscriptionCreateForm = (props: { onSubmit: (data: z.infer) => void; originalText?: string; onCancel?: () => void; - transcribing?: boolean; - transcribingProgress?: number; + transcribing: boolean; + transcribingProgress: number; + transcribingOutput: string; }) => { const { transcribing = false, transcribingProgress = 0, + transcribingOutput, onSubmit, onCancel, originalText, @@ -62,6 +67,7 @@ export const TranscriptionCreateForm = (props: { language: learningLanguage, service: whisperConfig.service, text: originalText, + isolate: false, }, }); @@ -127,7 +133,7 @@ export const TranscriptionCreateForm = (props: { control={form.control} name="service" render={({ field }) => ( - + {t("sttAiService")} - + ( - + {t("uploadTranscriptFile")}({t("optinal")}) @@ -205,6 +211,9 @@ export const TranscriptionCreateForm = (props: { } }} /> + + {t("uploadTranscriptFileDescription")} + {field.value != undefined && ( <> {t("transcript")} @@ -219,45 +228,92 @@ export const TranscriptionCreateForm = (props: { )} /> + ( + + {t("isolateVoice")} + + + {t("isolateVoiceDescription")} + + + )} + /> -
+
- {transcribing && form.watch("service") === "local" && ( -
-
- - {t("transcribing")} -
- {whisperConfig.service === "local" && ( - - )} -
- )} +
- {onCancel && ( + {onCancel && !transcribing && ( )}
); }; + +const TranscribeProgress = (props: { + service: string; + transcribing: boolean; + transcribingProgress: number; + transcribingOutput?: string; +}) => { + const { service, transcribing, transcribingProgress, transcribingOutput } = + props; + if (!transcribing) return null; + + return ( +
+
+ + {t("transcribing")} +
+ {service === "local" && transcribingProgress > 0 && ( + + )} + {transcribingOutput && ( +
+ + {transcribingOutput} + +
+ )} +
+ ); +}; diff --git a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx index b1f2774f..c6b32241 100644 --- a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx +++ b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx @@ -39,14 +39,12 @@ export const TranscriptionEditButton = (props: { const handleSave = async () => { setSubmiting(true); - try { - await generateTranscription({ originalText: content }); - setOpen(false); - } catch (e) { - toast.error(e.message); - } - - setSubmiting(false); + generateTranscription({ originalText: content }) + .then(() => setOpen(false)) + .catch((e) => { + toast.error(e.message); + }) + .finally(() => setSubmiting(false)); }; return ( diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx index ddef181c..5c050db7 100644 --- a/enjoy/src/renderer/context/media-player-provider.tsx +++ b/enjoy/src/renderer/context/media-player-provider.tsx @@ -69,9 +69,11 @@ type MediaPlayerContextType = { originalText?: string; language?: string; service?: WhisperConfigType["service"]; - }) => void; + isolate?: boolean; + }) => Promise; transcribing: boolean; transcribingProgress: number; + transcribingOutput: string; transcriptionDraft: TranscriptionType["result"]; setTranscriptionDraft: (result: TranscriptionType["result"]) => void; // Recordings @@ -172,6 +174,7 @@ export const MediaPlayerProvider = ({ generateTranscription, transcribing, transcribingProgress, + transcribingOutput, abortGenerateTranscription, } = useTranscriptions(media); @@ -611,6 +614,7 @@ export const MediaPlayerProvider = ({ generateTranscription, transcribing, transcribingProgress, + transcribingOutput, transcriptionDraft, setTranscriptionDraft, isRecording, diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx index 79d9e515..ad4f9ad7 100644 --- a/enjoy/src/renderer/hooks/use-transcribe.tsx +++ b/enjoy/src/renderer/hooks/use-transcribe.tsx @@ -3,7 +3,7 @@ import { AISettingsProviderContext, } from "@renderer/context"; import OpenAI from "openai"; -import { useContext } from "react"; +import { useContext, useState } from "react"; import { t } from "i18next"; import { AI_WORKER_ENDPOINT } from "@/constants"; import * as sdk from "microsoft-cognitiveservices-speech-sdk"; @@ -15,6 +15,7 @@ export const useTranscribe = () => { const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext); const { openai } = useContext(AISettingsProviderContext); const { punctuateText } = useAiCommand(); + const [output, setOutput] = useState(""); const transcode = async (src: string | Blob): Promise => { if (src instanceof Blob) { @@ -36,6 +37,7 @@ export const useTranscribe = () => { originalText?: string; language: string; service: WhisperConfigType["service"]; + isolate?: boolean; } ): Promise<{ engine: string; @@ -45,8 +47,14 @@ export const useTranscribe = () => { tokenId?: number; }> => { const url = await transcode(mediaSrc); - const { targetId, targetType, originalText, language, service } = - params || {}; + const { + targetId, + targetType, + originalText, + language, + service, + isolate = false, + } = params || {}; const blob = await (await fetch(url)).blob(); let result; @@ -70,6 +78,8 @@ export const useTranscribe = () => { throw new Error(t("whisperServiceNotSupported")); } + setOutput(null); + let transcript = originalText || 
result.text; // Remove all content inside `()`, `[]`, `{}` and trim the text @@ -93,6 +103,7 @@ export const useTranscribe = () => { transcript, { language, + isolate, } ); @@ -193,7 +204,8 @@ export const useTranscribe = () => { return new Promise((resolve, reject) => { reco.recognizing = (_s, e) => { - console.log(e.result.text); + console.log(e.result); + setOutput(e.result.text); }; reco.recognized = (_s, e) => { @@ -230,5 +242,6 @@ export const useTranscribe = () => { return { transcode, transcribe, + output, }; }; diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx index b5db427d..1d172703 100644 --- a/enjoy/src/renderer/hooks/use-transcriptions.tsx +++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx @@ -16,11 +16,17 @@ export const useTranscriptions = (media: AudioType | VideoType) => { ); const { addDblistener, removeDbListener } = useContext(DbProviderContext); const [transcription, setTranscription] = useState(null); - const { transcribe } = useTranscribe(); + const { transcribe, output } = useTranscribe(); const [transcribingProgress, setTranscribingProgress] = useState(0); const [transcribing, setTranscribing] = useState(false); + const [transcribingOutput, setTranscribingOutput] = useState(""); + const [service, setService] = useState( + whisperConfig.service + ); const onTransactionUpdate = (event: CustomEvent) => { + if (!transcription) return; + const { model, action, record } = event.detail || {}; if ( model === "Transcription" && @@ -58,12 +64,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => { originalText?: string; language?: string; service?: WhisperConfigType["service"]; + isolate?: boolean; }) => { let { originalText, language = learningLanguage, service = whisperConfig.service, + isolate = false, } = params || {}; + setService(service); + if (originalText === undefined) { if (transcription?.targetId === media.id) { originalText = transcription.result?.originalText; @@ -77,131 +87,135 @@ export const useTranscriptions = (media: AudioType | VideoType) => { setTranscribing(true); setTranscribingProgress(0); - try { - const { engine, model, alignmentResult, tokenId } = await transcribe( - media.src, - { - targetId: media.id, - targetType: media.mediaType, - originalText, - language, - service, - } - ); - - let timeline: TimelineEntry[] = []; - alignmentResult.timeline.forEach((t) => { - if (t.type === "sentence") { - timeline.push(t); - } else { - t.timeline.forEach((st) => { - timeline.push(st); - }); - } - }); - - /* - * Pre-process - * 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc - * 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display; - * 3. 
Some numbers with `%` are split into `number + percent` in words timeline, merge them for display; - */ - try { - timeline.forEach((sentence, i) => { - const nextSentence = timeline[i + 1]; - if ( - !sentence.text - .replaceAll(MAGIC_TOKEN_REGEX, "") - .match(END_OF_SENTENCE_REGEX) && - nextSentence?.text - ) { - nextSentence.text = [sentence.text, nextSentence.text].join(" "); - nextSentence.timeline = [ - ...sentence.timeline, - ...nextSentence.timeline, - ]; - nextSentence.startTime = sentence.startTime; - timeline.splice(i, 1); - } else { - const words = sentence.text.split(" "); - - sentence.timeline.forEach((token, j) => { - const word = words[j]?.trim()?.toLowerCase(); - - const match = word?.match(/-|%/); - if (!match) return; - - if ( - word === "-" && - token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase() - ) { - sentence.timeline.splice(j, 0, { - type: "token", - text: "-", - startTime: sentence.timeline[j - 1]?.endTime || 0, - endTime: sentence.timeline[j - 1]?.endTime || 0, - timeline: [], - }); - return; - } - - for (let k = j + 1; k <= sentence.timeline.length - 1; k++) { - if (word.includes(sentence.timeline[k].text.toLowerCase())) { - let connector = ""; - if (match[0] === "-") { - connector = "-"; - } - token.text = [token.text, sentence.timeline[k].text].join( - connector - ); - token.timeline = [ - ...token.timeline, - ...sentence.timeline[k].timeline, - ]; - token.endTime = sentence.timeline[k].endTime; - sentence.timeline.splice(k, 1); - } else { - break; - } - } - }); - } - }); - } catch (err) { - console.error(err); - } - - await EnjoyApp.transcriptions.update(transcription.id, { - state: "finished", - result: { - timeline: timeline, - transcript: alignmentResult.transcript, - originalText, - tokenId, - }, - engine, - model, + const { engine, model, alignmentResult, tokenId } = await transcribe( + media.src, + { + targetId: media.id, + targetType: media.mediaType, + originalText, language, - }); - - if (media.language !== language) { - if (media.mediaType === "Video") { - await EnjoyApp.videos.update(media.id, { - language, - }); - } else { - await EnjoyApp.audios.update(media.id, { - language, - }); - } + service, + isolate, + } + ); + + let timeline: TimelineEntry[] = []; + alignmentResult.timeline.forEach((t) => { + if (t.type === "sentence") { + timeline.push(t); + } else { + t.timeline.forEach((st) => { + timeline.push(st); + }); + } + }); + + timeline = preProcessTranscription(timeline); + if (media.language !== language) { + if (media.mediaType === "Video") { + await EnjoyApp.videos.update(media.id, { + language, + }); + } else { + await EnjoyApp.audios.update(media.id, { + language, + }); } - } catch (err) { - toast.error(err.message); } + await EnjoyApp.transcriptions.update(transcription.id, { + state: "finished", + result: { + timeline: timeline, + transcript: alignmentResult.transcript, + originalText, + tokenId, + }, + engine, + model, + language, + }); + setTranscribing(false); }; + const preProcessTranscription = (timeline: TimelineEntry[]) => { + /* + * Pre-process + * 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc + * 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display; + * 3. 
Some numbers with `%` are split into `number + percent` in words timeline, merge them for display; + */ + try { + timeline.forEach((sentence, i) => { + const nextSentence = timeline[i + 1]; + if ( + !sentence.text + .replaceAll(MAGIC_TOKEN_REGEX, "") + .match(END_OF_SENTENCE_REGEX) && + nextSentence?.text + ) { + nextSentence.text = [sentence.text, nextSentence.text].join(" "); + nextSentence.timeline = [ + ...sentence.timeline, + ...nextSentence.timeline, + ]; + nextSentence.startTime = sentence.startTime; + timeline.splice(i, 1); + } else { + const words = sentence.text.split(" "); + + sentence.timeline.forEach((token, j) => { + const word = words[j]?.trim()?.toLowerCase(); + + const match = word?.match(/-|%/); + if (!match) return; + + if ( + word === "-" && + token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase() + ) { + sentence.timeline.splice(j, 0, { + type: "token", + text: "-", + startTime: sentence.timeline[j - 1]?.endTime || 0, + endTime: sentence.timeline[j - 1]?.endTime || 0, + timeline: [], + }); + return; + } + + for (let k = j + 1; k <= sentence.timeline.length - 1; k++) { + if (word.includes(sentence.timeline[k].text.toLowerCase())) { + let connector = ""; + if (match[0] === "-") { + connector = "-"; + } + token.text = [token.text, sentence.timeline[k].text].join( + connector + ); + token.timeline = [ + ...token.timeline, + ...sentence.timeline[k].timeline, + ]; + token.endTime = sentence.timeline[k].endTime; + sentence.timeline.splice(k, 1); + } else { + break; + } + } + }); + } + }); + } catch (err) { + console.warn(err); + toast.warning( + `Failed to pre-process transcription timeline: ${err.message}` + ); + } + return timeline; + }; + const findTranscriptionFromWebApi = async () => { if (!transcription) { await findOrCreateTranscription(); @@ -252,32 +266,40 @@ export const useTranscriptions = (media: AudioType | VideoType) => { }, [media]); /* - * auto-generate transcription result + * listen to transcription update */ useEffect(() => { if (!transcription) return; addDblistener(onTransactionUpdate); + return () => { + removeDbListener(onTransactionUpdate); + }; + }, [transcription]); - // if ( - // transcription.state == "pending" || - // !transcription.result?.["timeline"] - // ) { - // findOrGenerateTranscription(); - // } + /* + * listen to transcribe progress + */ + useEffect(() => { + if (!transcribing) return; - if (whisperConfig.service === "local") { + if (service === "local") { EnjoyApp.whisper.onProgress((_, p: number) => { if (p > 100) p = 100; setTranscribingProgress(p); }); } + EnjoyApp.app.onCmdOutput((_, output) => { + setTranscribingOutput(output); + }); + return () => { - removeDbListener(onTransactionUpdate); EnjoyApp.whisper.removeProgressListeners(); + EnjoyApp.app.removeCmdOutputListeners(); + setTranscribingOutput(null); }; - }, [transcription, media]); + }, [media, service, transcribing]); const abortGenerateTranscription = () => { EnjoyApp.whisper.abort(); @@ -288,6 +310,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => { transcription, transcribingProgress, transcribing, + transcribingOutput: output || transcribingOutput, generateTranscription, abortGenerateTranscription, }; diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts index 07954ca6..f23ff54b 100644 --- a/enjoy/src/types/enjoy-app.d.ts +++ b/enjoy/src/types/enjoy-app.d.ts @@ -10,6 +10,8 @@ type EnjoyAppType = { quit: () => Promise; openDevTools: () => Promise; createIssue: (title: string, body: string) => Promise; + onCmdOutput: 
(callback: (event, output: string) => void) => void; + removeCmdOutputListeners: () => void; version: string; }; window: {
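Note on the stdout/stderr forwarding added in enjoy/src/main/window.ts: the patched write() overrides strip ANSI color codes before sending text over the "app-on-cmd-output" channel. Below is a minimal standalone sketch of that stripping step, using the same regular expression as the diff; the sample log line is made up for illustration and is not produced by this change.

// A quick standalone check (not part of the diff) of the ANSI stripping done
// in main/window.ts before output is forwarded to the renderer.
// The regular expression is copied verbatim from the patched write() overrides.
const stripAnsi = (text: string): string =>
  text.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");

// "\x1B[32m...\x1B[0m" wraps the text in green; after stripping, the renderer
// receives plain text on the "app-on-cmd-output" channel.
console.assert(
  stripAnsi("\x1B[32mtranscribing: 42%\x1B[0m") === "transcribing: 42%"
);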
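For context on how the new bridge methods are consumed, use-transcriptions.tsx attaches the listener while transcribing and removes it on cleanup. Below is a minimal sketch of the same pattern as a standalone renderer hook; the hook name useCmdOutput is illustrative and not part of this change, and EnjoyApp refers to the __ENJOY_APP__ bridge described by EnjoyAppType in enjoy-app.d.ts.

import { useEffect, useState } from "react";

// Hypothetical hook (name and shape are illustrative, not part of this change):
// mirror the latest command-line output forwarded from the main process into
// React state, and detach the listener on unmount so the "app-on-cmd-output"
// channel does not keep stale callbacks around.
const useCmdOutput = (EnjoyApp: EnjoyAppType): string => {
  const [output, setOutput] = useState<string>("");

  useEffect(() => {
    EnjoyApp.app.onCmdOutput((_event, data) => setOutput(data));
    return () => {
      EnjoyApp.app.removeCmdOutputListeners();
    };
  }, [EnjoyApp]);

  return output;
};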