diff --git a/enjoy/src/i18n/en.json b/enjoy/src/i18n/en.json
index 3a2af169..efccecee 100644
--- a/enjoy/src/i18n/en.json
+++ b/enjoy/src/i18n/en.json
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "Download FFmpeg",
   "youAreReadyToGo": "You are ready to go",
   "welcomeBack": "Welcome back! {{name}}",
+  "print": "Print",
   "download": "Download",
   "downloading": "Downloading {{file}}",
   "downloadedSuccessfully": "Downloaded successfully",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
   "openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
+  "uploadSpeechToTextDescription": "Upload a transcript file or enter transcript text to align.",
   "checkingWhisper": "Checking whisper status",
   "pleaseDownloadWhisperModelFirst": "Please download whisper model first",
   "whisperIsWorkingGood": "Whisper is working good",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "Assessed successfully",
   "optinal": "Optional",
   "uploadTranscriptFile": "Upload transcript file",
-  "uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
+  "uploadTranscriptFileDescription": "Supported formats: txt/srt/vtt.",
+  "pleaseUploadTranscriptFile": "Please upload a transcript file",
   "onlyTextFileIsSupported": "Only text file is supported",
   "isolateVoice": "Isolate voice(Experimental)",
   "isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
diff --git a/enjoy/src/i18n/zh-CN.json b/enjoy/src/i18n/zh-CN.json
index a8220b4f..fd98891d 100644
--- a/enjoy/src/i18n/zh-CN.json
+++ b/enjoy/src/i18n/zh-CN.json
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "下载 FFmpeg",
   "youAreReadyToGo": "您已准备就绪",
   "welcomeBack": "欢迎回来, {{name}}",
+  "print": "打印",
   "download": "下载",
   "downloading": "正在下载 {{file}}",
   "downloadedSuccessfully": "下载成功",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
   "openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
+  "uploadSpeechToTextDescription": "上传字幕文件或者输入文本进行字幕对齐",
   "checkingWhisper": "正在检查 Whisper",
   "pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
   "whisperIsWorkingGood": "Whisper 正常工作",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "评估成功",
   "optinal": "可选",
   "uploadTranscriptFile": "上传字幕文件",
-  "uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
+  "uploadTranscriptFileDescription": "支持字幕文件格式: txt/srt/vtt。",
+  "pleaseUploadTranscriptFile": "请上传字幕文件",
   "onlyTextFileIsSupported": "仅支持文本文件",
   "isolateVoice": "提取人声(实验性)",
   "isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
diff --git a/enjoy/src/main/echogarden.ts b/enjoy/src/main/echogarden.ts
index ae8de742..217ac5f4 100644
--- a/enjoy/src/main/echogarden.ts
+++ b/enjoy/src/main/echogarden.ts
@@ -1,7 +1,6 @@
 import { ipcMain } from "electron";
 import * as Echogarden from "echogarden/dist/api/API.js";
 import { AlignmentOptions } from "echogarden/dist/api/API";
-import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
 import {
   encodeRawAudioToWave,
   decodeWaveToRawAudio,
@@ -9,7 +8,9 @@ import {
   getRawAudioDuration,
   trimAudioStart,
   trimAudioEnd,
+  AudioSourceParam,
 } from "echogarden/dist/audio/AudioUtilities.js";
+import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -34,6 +35,7 @@ const __dirname = path

 const logger = log.scope("echogarden");

 class EchogardenWrapper {
   public align: typeof Echogarden.align;
+  public alignSegments: typeof Echogarden.alignSegments;
   public denoise: typeof Echogarden.denoise;
   public encodeRawAudioToWave: typeof encodeRawAudioToWave;
   public decodeWaveToRawAudio: typeof decodeWaveToRawAudio;
@@ -44,6 +46,7 @@ class EchogardenWrapper {

   constructor() {
     this.align = Echogarden.align;
+    this.alignSegments = Echogarden.alignSegments;
     this.denoise = Echogarden.denoise;
     this.encodeRawAudioToWave = encodeRawAudioToWave;
     this.decodeWaveToRawAudio = decodeWaveToRawAudio;
@@ -110,6 +113,25 @@ class EchogardenWrapper {
       }
     );

+    ipcMain.handle(
+      "echogarden-align-segments",
+      async (
+        _event,
+        input: AudioSourceParam,
+        timeline: Timeline,
+        options: AlignmentOptions
+      ) => {
+        logger.debug("echogarden-align-segments:", timeline, options);
+        try {
+          const rawAudio = await this.ensureRawAudio(input, 16000);
+          return await this.alignSegments(rawAudio, timeline, options);
+        } catch (err) {
+          logger.error(err);
+          throw err;
+        }
+      }
+    );
+
     ipcMain.handle(
       "echogarden-transcode",
       async (_event, url: string, sampleRate?: number) => {
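Note on the new IPC channel: the main process decodes whatever audio source it receives down to 16 kHz raw audio before delegating to Echogarden's alignSegments. A minimal renderer-side sketch of a call through the bridge (the bridge name matches the preload.ts change further down; the timeline shape and the options object are assumptions based on the handler's signature, not a confirmed API):

// Sketch: align known sentence boundaries against an audio source.
// Assumes the __ENJOY_APP__ bridge exposed in preload.ts below.
async function alignKnownSentences(src: string) {
  const blob = await (await fetch(src)).blob();

  const sentenceTimeline = [
    {
      type: "sentence",
      text: "Ask not what your country can do for you.",
      startTime: 0.0,
      endTime: 3.2,
      timeline: [], // alignSegments fills in word-level entries
    },
  ];

  // Returns a word-level timeline (see use-transcribe.tsx below).
  return (window as any).__ENJOY_APP__.echogarden.alignSegments(
    new Uint8Array(await blob.arrayBuffer()),
    sentenceTimeline,
    { language: "en" }
  );
}
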
"en" : language?.split("-")?.[0] || "auto", + `--split-on-word`, + `--max-len`, + "1", ...extra, ]; diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts index d9b6900a..2d1789d5 100644 --- a/enjoy/src/preload.ts +++ b/enjoy/src/preload.ts @@ -2,6 +2,7 @@ // https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron"; import { version } from "../package.json"; +import { Timeline } from "echogarden/dist/utilities/Timeline"; contextBridge.exposeInMainWorld("__ENJOY_APP__", { app: { @@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", { align: (input: string, transcript: string, options: any) => { return ipcRenderer.invoke("echogarden-align", input, transcript, options); }, + alignSegments: (input: string, timeline: Timeline, options: any) => { + return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options); + }, transcode: (input: string) => { return ipcRenderer.invoke("echogarden-transcode", input); }, diff --git a/enjoy/src/renderer/components/medias/index.ts b/enjoy/src/renderer/components/medias/index.ts index a99b6b46..f57e0d14 100644 --- a/enjoy/src/renderer/components/medias/index.ts +++ b/enjoy/src/renderer/components/medias/index.ts @@ -12,4 +12,4 @@ export * from "./media-provider"; export * from "./media-tabs"; export * from "./media-loading-modal"; export * from "./add-media-button"; -export * from "./media-transcription-download"; +export * from "./media-transcription-print"; diff --git a/enjoy/src/renderer/components/medias/media-caption.tsx b/enjoy/src/renderer/components/medias/media-caption.tsx index eb4a8d27..e5b3f60b 100644 --- a/enjoy/src/renderer/components/medias/media-caption.tsx +++ b/enjoy/src/renderer/components/medias/media-caption.tsx @@ -246,7 +246,6 @@ export const MediaCaption = () => { if (index < 0) return; if (index !== activeIndex) { - console.log("setActiveIndex", index); setActiveIndex(index); } }, [currentTime, caption]); @@ -509,8 +508,8 @@ export const Caption = (props: { let words = caption.text.split(" "); const ipas = caption.timeline.map((w) => - w.timeline.map((t) => - language.startsWith("en") + w.timeline?.map((t) => + t.timeline && language.startsWith("en") ? 
diff --git a/enjoy/src/preload.ts b/enjoy/src/preload.ts
index d9b6900a..2d1789d5 100644
--- a/enjoy/src/preload.ts
+++ b/enjoy/src/preload.ts
@@ -2,6 +2,7 @@
 // https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
 import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
 import { version } from "../package.json";
+import { Timeline } from "echogarden/dist/utilities/Timeline";

 contextBridge.exposeInMainWorld("__ENJOY_APP__", {
   app: {
@@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     align: (input: string, transcript: string, options: any) => {
       return ipcRenderer.invoke("echogarden-align", input, transcript, options);
     },
+    alignSegments: (input: string, timeline: Timeline, options: any) => {
+      return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
+    },
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
     },
diff --git a/enjoy/src/renderer/components/medias/index.ts b/enjoy/src/renderer/components/medias/index.ts
index a99b6b46..f57e0d14 100644
--- a/enjoy/src/renderer/components/medias/index.ts
+++ b/enjoy/src/renderer/components/medias/index.ts
@@ -12,4 +12,4 @@ export * from "./media-provider";
 export * from "./media-tabs";
 export * from "./media-loading-modal";
 export * from "./add-media-button";
-export * from "./media-transcription-download";
+export * from "./media-transcription-print";
diff --git a/enjoy/src/renderer/components/medias/media-caption.tsx b/enjoy/src/renderer/components/medias/media-caption.tsx
index eb4a8d27..e5b3f60b 100644
--- a/enjoy/src/renderer/components/medias/media-caption.tsx
+++ b/enjoy/src/renderer/components/medias/media-caption.tsx
@@ -246,7 +246,6 @@ export const MediaCaption = () => {
     if (index < 0) return;

     if (index !== activeIndex) {
-      console.log("setActiveIndex", index);
       setActiveIndex(index);
     }
   }, [currentTime, caption]);
@@ -509,8 +508,8 @@ export const Caption = (props: {
   let words = caption.text.split(" ");

   const ipas = caption.timeline.map((w) =>
-    w.timeline.map((t) =>
-      language.startsWith("en")
+    w.timeline?.map((t) =>
+      t.timeline && language.startsWith("en")
         ? convertWordIpaToNormal(
             t.timeline.map((s) => s.text),
             { mappings: ipaMappings }
diff --git a/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx b/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
index a0a90f8d..0582ea07 100644
--- a/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
+++ b/enjoy/src/renderer/components/medias/media-captions/tab-content-translation.tsx
@@ -5,7 +5,7 @@ import {
 } from "@renderer/context";
 import { TabsContent, Separator } from "@renderer/components/ui";
 import { t } from "i18next";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import {
   CamdictLookupResult,
@@ -41,7 +41,9 @@ const SelectedWords = (props: {
   const { selectedIndices, caption } = props;
   const { transcription } = useContext(MediaPlayerProviderContext);

-  const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);
+  const { learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );

   const word = selectedIndices
     .map((index) => caption.timeline[index]?.text || "")
diff --git a/enjoy/src/renderer/components/medias/media-player-controls.tsx b/enjoy/src/renderer/components/medias/media-player-controls.tsx
index d778f2c2..b387bd2c 100644
--- a/enjoy/src/renderer/components/medias/media-player-controls.tsx
+++ b/enjoy/src/renderer/components/medias/media-player-controls.tsx
@@ -34,7 +34,7 @@ import { useHotkeys } from "react-hotkeys-hook";
 import cloneDeep from "lodash/cloneDeep";
 import debounce from "lodash/debounce";
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";

 const PLAYBACK_RATE_OPTIONS = [0.75, 0.8, 0.9, 1.0];

 export const MediaPlayerControls = () => {
@@ -57,7 +57,7 @@ export const MediaPlayerControls = () => {
     setTranscriptionDraft,
   } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
-  const { currentHotkeys, enabled } = useContext(
+  const { currentHotkeys } = useContext(
     HotKeysSettingsProviderContext
   );
   const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("single");
diff --git a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
index 6d64262e..e17efd4d 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-generate-button.tsx
@@ -76,7 +76,9 @@ export const MediaTranscriptionGenerateButton = (props: {
     generateTranscription({
       originalText: data.text,
       language: data.language,
-      service: data.service as WhisperConfigType["service"],
+      service: data.service as
+        | WhisperConfigType["service"]
+        | "upload",
       isolate: data.isolate,
     })
       .then(() => {
diff --git a/enjoy/src/renderer/components/medias/media-transcription-download.tsx b/enjoy/src/renderer/components/medias/media-transcription-print.tsx
similarity index 95%
rename from enjoy/src/renderer/components/medias/media-transcription-download.tsx
rename to enjoy/src/renderer/components/medias/media-transcription-print.tsx
index 970ab998..ba959033 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-download.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-print.tsx
@@ -9,7 +9,7 @@
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import template from "./transcription.template.html?raw";

-export const MediaTranscriptionDownload = () => {
+export const MediaTranscriptionPrint = () => {
   const { media, transcription } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
     AppSettingsProviderContext
   );
@@ -59,7 +59,7 @@
   async function download() {
     try {
       const savePath = await EnjoyApp.dialog.showSaveDialog({
-        title: t("download"),
+        title: t("print"),
         defaultPath: `${media.name}.pdf`,
       });
@@ -75,7 +75,7 @@
   return (
   );
 };
diff --git a/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx b/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
index 92635c85..0807a702 100644
--- a/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-read-button.tsx
@@ -28,7 +28,7 @@ import {
   SheetHeader,
   toast,
 } from "@renderer/components/ui";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { t } from "i18next";
 import WaveSurfer from "wavesurfer.js";
 import {
diff --git a/enjoy/src/renderer/components/medias/media-transcription.tsx b/enjoy/src/renderer/components/medias/media-transcription.tsx
index 3076420a..fa10dc5b 100644
--- a/enjoy/src/renderer/components/medias/media-transcription.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -26,7 +26,7 @@ import { formatDuration } from "@renderer/lib/utils";
 import {
   MediaTranscriptionReadButton,
   MediaTranscriptionGenerateButton,
-  MediaTranscriptionDownload,
+  MediaTranscriptionPrint,
   TranscriptionEditButton,
 } from "@renderer/components";

@@ -165,7 +165,7 @@ export const MediaTranscription = (props: { display?: boolean }) => {
-        <MediaTranscriptionDownload />
+        <MediaTranscriptionPrint />
diff --git a/enjoy/src/renderer/components/notes/note-segment.tsx b/enjoy/src/renderer/components/notes/note-segment.tsx
index 266bd266..f2f1eb9d 100644
--- a/enjoy/src/renderer/components/notes/note-segment.tsx
+++ b/enjoy/src/renderer/components/notes/note-segment.tsx
@@ -1,4 +1,4 @@
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { useContext, useState } from "react";
 import { WavesurferPlayer } from "@/renderer/components/misc";
 import { AppSettingsProviderContext } from "@/renderer/context";
diff --git a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
index 80ff3b90..5beb3df3 100644
--- a/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
+++ b/enjoy/src/renderer/components/transcriptions/transcription-create-form.tsx
@@ -3,14 +3,11 @@ import {
   AppSettingsProviderContext,
 } from "@renderer/context";
 import { zodResolver } from "@hookform/resolvers/zod";
-import { useContext, useState } from "react";
+import { useContext } from "react";
 import { useForm } from "react-hook-form";
 import { z } from "zod";
 import {
   Button,
-  Collapsible,
-  CollapsibleContent,
-  CollapsibleTrigger,
   Form,
   FormDescription,
   FormField,
@@ -31,8 +28,9 @@ import {
 } from "@renderer/components/ui";
 import { t } from "i18next";
 import { LANGUAGES } from "@/constants";
-import { ChevronDownIcon, ChevronUpIcon, LoaderIcon } from "lucide-react";
+import { LoaderIcon } from "lucide-react";
 import { parseText } from "media-captions";
+import { milisecondsToTimestamp } from "@/utils";

 const transcriptionSchema = z.object({
   language: z.string(),
@@ -59,18 +57,28 @@ export const TranscriptionCreateForm = (props: {
   } = props;
   const { learningLanguage } = useContext(AppSettingsProviderContext);
   const { whisperConfig } = useContext(AISettingsProviderContext);
-  const [collapsibleOpen, setCollapsibleOpen] = useState(false);

   const form = useForm<z.infer<typeof transcriptionSchema>>({
     resolver: zodResolver(transcriptionSchema),
     values: {
       language: learningLanguage,
-      service: whisperConfig.service,
+      service: originalText ? "upload" : whisperConfig.service,
       text: originalText,
       isolate: false,
     },
   });

+  const handleSubmit = (data: z.infer<typeof transcriptionSchema>) => {
+    const { service, text } = data;
+
+    if (service === "upload" && !text) {
+      toast.error(t("pleaseUploadTranscriptFile"));
+      return;
+    }
+
+    onSubmit(data);
+  };
+
   const parseSubtitle = (file: File) => {
     const fileType = file.name.split(".").pop();
     return new Promise((resolve, reject) => {
@@ -88,7 +96,16 @@ export const TranscriptionCreateForm = (props: {
       if (caption.cues.length === 0) {
         text = cleanSubtitleText(text as string);
       } else {
-        text = caption.cues.map((cue) => cue.text).join("\n");
+        // Write cues to text in SRT format
+        text = caption.cues
+          .map((cue, _) => {
+            return `${milisecondsToTimestamp(
+              cue.startTime * 1000
+            )} --> ${milisecondsToTimestamp(cue.endTime * 1000)}\n${
+              cue.text
+            }`;
+          })
+          .join("\n\n");
       }

       if (text.length === 0) {
@@ -126,7 +143,7 @@ export const TranscriptionCreateForm = (props: {
   return (
                   OpenAI
+                  {t("upload")}
+
+              {form.watch("service") === "local" &&
+                t("localSpeechToTextDescription")}
+              {form.watch("service") === "azure" &&
+                t("azureSpeechToTextDescription")}
+              {form.watch("service") === "cloudflare" &&
+                t("cloudflareSpeechToTextDescription")}
+              {form.watch("service") === "openai" &&
+                t("openaiSpeechToTextDescription")}
+              {form.watch("service") === "upload" &&
+                t("uploadSpeechToTextDescription")}
           )}
         />
@@ -181,16 +211,14 @@ export const TranscriptionCreateForm = (props: {
           )}
         />
-
-
+        {form.watch("service") === "upload" && (
+          <>
             (
-
-              {t("uploadTranscriptFile")}({t("optinal")})
-
+              {t("uploadTranscriptFile")}
           )}
         />
-
-
-
-
-
-
-
+
+        )}
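For reference, the cue-to-SRT serialization above produces blocks like the following (a worked example; milisecondsToTimestamp is the helper patched in utils.ts at the end of this diff, assumed here to emit SRT-style HH:MM:SS,mmm timestamps):

import { milisecondsToTimestamp } from "@/utils";

const cues = [
  { startTime: 1.5, endTime: 3.25, text: "Hello there." },
  { startTime: 3.5, endTime: 5.25, text: "General Kenobi." },
];

const text = cues
  .map(
    (cue) =>
      `${milisecondsToTimestamp(cue.startTime * 1000)} --> ${milisecondsToTimestamp(
        cue.endTime * 1000
      )}\n${cue.text}`
  )
  .join("\n\n");

// text:
// 00:00:01,500 --> 00:00:03,250
// Hello there.
//
// 00:00:03,500 --> 00:00:05,250
// General Kenobi.
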
diff --git a/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx b/enjoy/src/renderer/components/transcriptions/transcription-edit-button.tsx
@@ ... @@ export const TranscriptionEditButton = (props: {
-  const [open, setOpen] = useState(false);
-  const [submiting, setSubmiting] = useState(false);
-  const { transcription, generateTranscription } = useContext(
+  const { media, transcription, generateTranscription } = useContext(
     MediaPlayerProviderContext
   );
+  const [open, setOpen] = useState(false);
+  const [submiting, setSubmiting] = useState(false);
   const [content, setContent] = useState(
-    transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
+    // generate text in SRT format from timeline entries
+    transcription.result.timeline
+      .map(
+        (t: TimelineEntry) =>
+          `${milisecondsToTimestamp(
+            t.startTime * 1000
+          )} --> ${milisecondsToTimestamp(t.endTime * 1000)}\n${t.text}`
+      )
+      .join("\n\n")
   );
+  const [downloadUrl, setDownloadUrl] = useState<string>();

   const handleSave = async () => {
     setSubmiting(true);
-    generateTranscription({ originalText: content })
+    generateTranscription({ originalText: content, service: "upload" })
       .then(() => setOpen(false))
       .catch((e) => {
         toast.error(e.message);
@@ -47,6 +57,13 @@ export const TranscriptionEditButton = (props: {
       .finally(() => setSubmiting(false));
   };

+  useEffect(() => {
+    if (!content) return;
+
+    const blob = new Blob([content], { type: "text/html" });
+    setDownloadUrl(URL.createObjectURL(blob));
+  }, [content]);
+
   return (
@@ -76,6 +93,11 @@ export const TranscriptionEditButton = (props: {
               {t("cancel")}
+
+
+
+
+
diff --git a/enjoy/src/renderer/context/media-player-provider.tsx b/enjoy/src/renderer/context/media-player-provider.tsx
index ce2fe5e9..12c7656a 100644
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -68,7 +68,7 @@ type MediaPlayerContextType = {
   generateTranscription: (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => Promise<void>;
   transcribing: boolean;
@@ -352,7 +352,7 @@ export const MediaPlayerProvider = ({
     let phones: TimelineEntry[] = [];
     words.forEach((word: TimelineEntry) => {
-      word.timeline.forEach((token: TimelineEntry) => {
+      word.timeline?.forEach((token: TimelineEntry) => {
         phones = phones.concat(token.timeline);
       });
     });
diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx
index 89c0f82e..d2db50cf 100644
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -8,9 +8,92 @@
 import { t } from "i18next";
 import { AI_WORKER_ENDPOINT } from "@/constants";
 import * as sdk from "microsoft-cognitiveservices-speech-sdk";
 import axios from "axios";
-import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { useAiCommand } from "./use-ai-command";
 import { toast } from "@renderer/components/ui";
+import {
+  Timeline,
+  TimelineEntry,
+  type TimelineEntryType,
+} from "echogarden/dist/utilities/Timeline";
+import take from "lodash/take";
+import sortedUniqBy from "lodash/sortedUniqBy";
+import { parseText } from "media-captions";
+
+/*
+ * Define the regex pattern that matches the end of a sentence:
+ * a period, question mark, or exclamation mark, optionally followed
+ * by a quotation mark, excluding special cases like "Mr.", "Mrs.",
+ * "Dr.", "Ms.", "etc."
+ */
+const sentenceEndPattern = /(?
+
+const wordTimelineToSentenceTimeline = (wordTimeline: TimelineEntry[]) => {
+  const timeline: TimelineEntry[] = [];
+
+  wordTimeline.forEach((word, index) => {
+    word.text = word.text.trim();
+    // skip empty words
+    if (!word.text) return;
+    // skip music or sound effects quoted in []
+    if (word.text.match(/^\[.*\]$/)) return;
+
+    const wordEntry = {
+      type: "word" as TimelineEntryType,
+      text: word.text,
+      startTime: word.startTime,
+      endTime: word.endTime,
+    };
+
+    let sentence: TimelineEntry;
+    // get the last sentence in the timeline
+    if (timeline.length > 0) {
+      sentence = timeline[timeline.length - 1];
+    }
+
+    // if there is no sentence in the timeline, or the last sentence
+    // already ends with punctuation, create a new sentence
+    if (!sentence || sentence.text.match(sentenceEndPattern)) {
+      sentence = {
+        type: "sentence" as TimelineEntryType,
+        text: "",
+        startTime: wordEntry.startTime,
+        endTime: wordEntry.endTime,
+        timeline: [],
+      };
+      timeline.push(sentence);
+    }
+
+    // if the word ends with punctuation, close the current sentence
+    if (wordEntry.text.match(sentenceEndPattern)) {
+      sentence.text += wordEntry.text;
+      sentence.endTime = wordEntry.endTime;
+
+      const lastSentence = timeline[timeline.length - 1];
+      if (lastSentence.endTime !== sentence.endTime) {
+        timeline.push(sentence);
+      }
+    } else {
+      sentence.text += wordEntry.text + " ";
+      sentence.endTime = wordEntry.endTime;
+
+      if (index === wordTimeline.length - 1) {
+        timeline.push(sentence);
+      }
+    }
+  });
+
+  return timeline;
+};

 export const useTranscribe = () => {
   const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
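The segmentation helper folds a flat word timeline into sentence entries by splitting on end-of-sentence punctuation. A hedged sketch of the two patterns it relies on follows; treat the sentenceEndPattern body as an assumption that merely matches the commented behavior (the PR's exact regex may differ), while punctuationsPattern mirrors the inline regex visible in the removed code further down:

// Assumed shapes, not necessarily the PR's exact definitions.
const sentenceEndPattern = /(?<!\b(?:Mr|Mrs|Dr|Ms|etc))[.?!]["']?$/;
const punctuationsPattern = /\w[.,!?](\s|$)/;

// Usage: fold a flat word timeline into sentence entries.
const sentences = wordTimelineToSentenceTimeline([
  { type: "word", text: "Hello", startTime: 0.0, endTime: 0.4 },
  { type: "word", text: "world.", startTime: 0.4, endTime: 0.9 },
  { type: "word", text: "Bye.", startTime: 1.2, endTime: 1.5 },
]);
// => [{ text: "Hello world.", startTime: 0.0, endTime: 0.9, ... },
//     { text: "Bye.", startTime: 1.2, endTime: 1.5, ... }]
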
@@ -37,13 +120,14 @@
       targetType?: string;
       originalText?: string;
       language: string;
-      service: WhisperConfigType["service"];
+      service: WhisperConfigType["service"] | "upload";
       isolate?: boolean;
     }
   ): Promise<{
     engine: string;
     model: string;
-    alignmentResult: AlignmentResult;
+    transcript: string;
+    timeline: TimelineEntry[];
     originalText?: string;
     tokenId?: number;
   }> => {
@@ -58,67 +142,152 @@
     } = params || {};

     const blob = await (await fetch(url)).blob();

-    let result;
-    if (originalText) {
-      result = {
-        engine: "original",
-        model: "original",
-      };
+    let result: any;
+    let timeline: Timeline = [];
+    if (service === "upload" && originalText) {
+      const caption = await parseText(originalText, { type: "srt" });
+      if (caption.cues.length > 0) {
+        timeline = caption.cues.map((cue) => {
+          return {
+            type: "sentence",
+            text: cue.text,
+            startTime: cue.startTime,
+            endTime: cue.endTime,
+            timeline: [],
+          };
+        });
+        result = {
+          engine: "upload",
+          model: "-",
+          text: timeline.map((entry) => entry.text).join(" "),
+          timeline,
+        };
+      } else {
+        result = {
+          engine: "upload",
+          model: "-",
+          text: originalText,
+        };
+      }
     } else if (service === "local") {
       result = await transcribeByLocal(url, language);
     } else if (service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (service === "openai") {
-      result = await transcribeByOpenAi(blob);
+      result = await transcribeByOpenAi(
+        new File([blob], "audio.mp3", { type: "audio/mp3" })
+      );
     } else if (service === "azure") {
-      result = await transcribeByAzureAi(blob, language, {
-        targetId,
-        targetType,
-      });
+      result = await transcribeByAzureAi(
+        new File([blob], "audio.wav", { type: "audio/wav" }),
+        language,
+        {
+          targetId,
+          targetType,
+        }
+      );
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }

+    let transcript = result.text;

-    setOutput(null);
+    /*
+     * If a timeline is available and the transcript contains punctuation,
+     * use `alignSegments` to align each sentence with the timeline;
+     * otherwise, use `align` to align the whole transcript.
+     * If the transcript does not contain any punctuation, use the AI
+     * command to add punctuation first.
+     */
+    if (result.timeline?.length && transcript.match(punctuationsPattern)) {
+      timeline = [...result.timeline];
+      setOutput("Aligning the transcript...");
+      const wordTimeline = await EnjoyApp.echogarden.alignSegments(
+        new Uint8Array(await blob.arrayBuffer()),
+        timeline,
+        {
+          language,
+          isolate,
+        }
+      );

-    let transcript = originalText || result.text;
+      wordTimeline.forEach((word: TimelineEntry) => {
+        let sentence = timeline.find(
+          (entry) =>
+            word.startTime >= entry.startTime && word.endTime <= entry.endTime
+        );

-    // Remove all content inside `()`, `[]`, `{}` and trim the text
-    // remove all markdown formatting
-    transcript = transcript
-      .replace(/\(.*?\)/g, "")
-      .replace(/\[.*?\]/g, "")
-      .replace(/\{.*?\}/g, "")
-      .replace(/[*_`]/g, "")
-      .trim();
+        if (sentence) {
+          sentence.timeline.push(word);
+        }
+      });

-    // if the transcript does not contain any punctuation, use AI command to add punctuation
-    if (!transcript.match(/\w[.,!?](\s|$)/)) {
-      try {
-        transcript = await punctuateText(transcript);
-      } catch (err) {
-        toast.error(err.message);
-        console.warn(err.message);
+      /*
+       * the start time of a sentence should be the start time of the first word in the sentence
+       * the end time of a sentence should be the end time of the last word in the sentence
+       */
+      // timeline.forEach((t) => {
+      //   if (t.timeline.length === 0) return;
+
+      //   t.startTime = t.timeline[0].startTime;
+      //   t.endTime = t.timeline[t.timeline.length - 1].endTime;
+      // });
+    } else {
+      // Remove all content inside `()`, `[]`, `{}` and trim the text
+      // remove all markdown formatting
+      transcript = transcript
+        .replace(/\(.*?\)/g, "")
+        .replace(/\[.*?\]/g, "")
+        .replace(/\{.*?\}/g, "")
+        .replace(/[*_`]/g, "")
+        .trim();
+
+      // if the transcript does not contain any punctuation, use the AI command to add punctuation
+      if (!transcript.match(punctuationsPattern)) {
+        try {
+          transcript = await punctuateText(transcript);
+        } catch (err) {
+          toast.error(err.message);
+          console.warn(err.message);
+        }
       }
+
+      setOutput("Aligning the transcript...");
+      const alignmentResult = await EnjoyApp.echogarden.align(
+        new Uint8Array(await blob.arrayBuffer()),
+        transcript,
+        {
+          language,
+          isolate,
+        }
+      );
+
+      alignmentResult.timeline.forEach((t: TimelineEntry) => {
+        if (t.type === "sentence") {
+          timeline.push(t);
+        } else {
+          t.timeline.forEach((st) => {
+            timeline.push(st);
+          });
+        }
+      });
     }

-    const alignmentResult = await EnjoyApp.echogarden.align(
-      new Uint8Array(await blob.arrayBuffer()),
-      transcript,
-      {
-        language,
-        isolate,
-      }
-    );
-
     return {
       ...result,
       originalText,
-      alignmentResult,
+      transcript,
+      timeline,
     };
   };

-  const transcribeByLocal = async (url: string, language?: string) => {
+  const transcribeByLocal = async (
+    url: string,
+    language?: string
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline: TimelineEntry[];
+  }> => {
     const res = await EnjoyApp.whisper.transcribe(
       {
         file: url,
@@ -130,14 +299,25 @@
       }
     );

+    const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.text,
+        startTime: word.offsets.from / 1000.0,
+        endTime: word.offsets.to / 1000.0,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "whisper",
       model: res.model.type,
       text: res.transcription.map((segment) => segment.text).join(" "),
+      timeline,
     };
   };

-  const transcribeByOpenAi = async (blob: Blob) => {
+  const transcribeByOpenAi = async (file: File) => {
     if (!openai?.key) {
       throw new Error(t("openaiKeyRequired"));
     }
@@ -149,20 +329,58 @@
       maxRetries: 0,
     });

-    const res: { text: string } = (await client.audio.transcriptions.create({
-      file: new File([blob], "audio.wav"),
+    const res: {
+      text: string;
+      words?: { word: string; start: number; end: number }[];
+      segments?: { text: string; start: number; end: number }[];
+    } = (await client.audio.transcriptions.create({
+      file,
       model: "whisper-1",
-      response_format: "json",
+      response_format: "verbose_json",
+      timestamp_granularities: ["word"],
    })) as any;

+    let timeline: TimelineEntry[] = [];
+    if (res.segments) {
+      res.segments.forEach((segment) => {
+        const segmentTimeline = {
+          type: "sentence" as TimelineEntryType,
+          text: segment.text,
+          startTime: segment.start,
+          endTime: segment.end,
+          timeline: [] as Timeline,
+        };
+
+        timeline.push(segmentTimeline);
+      });
+    } else if (res.words) {
+      const wordTimeline = res.words.map((word) => {
+        return {
+          type: "word" as TimelineEntryType,
+          text: word.word,
+          startTime: word.start,
+          endTime: word.end,
+        };
+      });
+      timeline = wordTimelineToSentenceTimeline(wordTimeline);
+    }
+
     return {
       engine: "openai",
       model: "whisper-1",
       text: res.text,
+      timeline,
     };
   };
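The OpenAI branch now uploads a named File (the SDK and API infer the container format from the filename) and requests verbose_json, which is the only response format for which timestamp_granularities is honored. A sketch of how the typed response above feeds the timeline (values are illustrative, and whether segments accompany word-level granularity is an assumption):

// Illustrative verbose_json response, matching the type above.
const res = {
  text: "Hello world.",
  words: [
    { word: "Hello", start: 0.0, end: 0.42 },
    { word: "world", start: 0.42, end: 0.9 },
  ],
};

const timeline = res.words
  ? wordTimelineToSentenceTimeline(
      res.words.map((w) => ({
        type: "word" as const,
        text: w.word,
        startTime: w.start,
        endTime: w.end,
      }))
    )
  : [];
// => a single sentence entry spanning 0.0 - 0.9 s
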
-  const transcribeByCloudflareAi = async (blob: Blob) => {
+  const transcribeByCloudflareAi = async (
+    blob: Blob
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline?: TimelineEntry[];
+  }> => {
     const res: CfWhipserOutputType = (
       await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
         headers: {
         },
       })
     ).data;

+    const wordTimeline = res.words.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.word,
+        startTime: word.start,
+        endTime: word.end,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "cloudflare",
       model: "@cf/openai/whisper",
       text: res.text,
+      timeline,
     };
   };

   const transcribeByAzureAi = async (
-    blob: Blob,
+    file: File,
     language: string,
     params?: {
       targetId?: string;
       targetType?: string;
     }
   ): Promise<{
     engine: string;
     model: string;
     text: string;
     tokenId: number;
+    timeline?: TimelineEntry[];
   }> => {
     const { id, token, region } = await webApi.generateSpeechToken(params);
     const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
-    const audioConfig = sdk.AudioConfig.fromWavFileInput(
-      new File([blob], "audio.wav")
-    );
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(file);

     // setting the recognition language to learning language, such as 'en-US'.
     config.speechRecognitionLanguage = language;
     config.requestWordLevelTimestamps();

     const reco = new sdk.SpeechRecognizer(config, audioConfig);
@@ -209,7 +437,6 @@
     return new Promise((resolve, reject) => {
       reco.recognizing = (_s, e) => {
-        console.log(e.result);
         setOutput(e.result.text);
       };
@@ -232,10 +459,40 @@
       reco.sessionStopped = (_s, _e) => {
         reco.stopContinuousRecognitionAsync();

+        const wordTimeline: TimelineEntry[] = [];
+        results.forEach((result) => {
+          const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
+          const splitedWords = best.Display.trim().split(" ");
+
+          best.Words.forEach((word, index) => {
+            let text = word.Word;
+            if (splitedWords.length === best.Words.length) {
+              text = splitedWords[index];
+            }
+
+            if (
+              index === best.Words.length - 1 &&
+              !text.trim().match(sentenceEndPattern)
+            ) {
+              text = text + ".";
+            }
+
+            wordTimeline.push({
+              type: "word" as TimelineEntryType,
+              text,
+              startTime: word.Offset / 10000000.0,
+              endTime: (word.Offset + word.Duration) / 10000000.0,
+            });
+          });
+        });
+
+        const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
         resolve({
           engine: "azure",
           model: "whisper",
           text: results.map((result) => result.DisplayText).join(" "),
+          timeline,
           tokenId: id,
         });
       };
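Azure Speech reports word offsets in 100-nanosecond ticks, hence the divisions by 10,000,000 above. A quick worked example:

// Azure Speech NBest word entry: Offset/Duration are 100-ns ticks.
const word = { Word: "hello", Offset: 23500000, Duration: 4200000 };

const startTime = word.Offset / 10000000.0;                 // 2.35 s
const endTime = (word.Offset + word.Duration) / 10000000.0; // 2.77 s
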
diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
index 5522734f..ba72684b 100644
--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -20,9 +20,9 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   const [transcribingProgress, setTranscribingProgress] = useState(0);
   const [transcribing, setTranscribing] = useState(false);
   const [transcribingOutput, setTranscribingOutput] = useState("");
-  const [service, setService] = useState<WhisperConfigType["service"]>(
-    whisperConfig.service
-  );
+  const [service, setService] = useState<
+    WhisperConfigType["service"] | "upload"
+  >(whisperConfig.service);

   const onTransactionUpdate = (event: CustomEvent) => {
     if (!transcription) return;
@@ -63,7 +63,7 @@
   const generateTranscription = async (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => {
     let {
@@ -87,7 +87,7 @@
         }
       }
     }
-    const { engine, model, alignmentResult, tokenId } = await transcribe(
+    const { engine, model, transcript, timeline, tokenId } = await transcribe(
       media.src,
       {
         targetId: media.id,
       }
     );

-    let timeline: TimelineEntry[] = [];
-    alignmentResult.timeline.forEach((t) => {
-      if (t.type === "sentence") {
-        timeline.push(t);
-      } else {
-        t.timeline.forEach((st) => {
-          timeline.push(st);
-        });
-      }
-    });
-
-    timeline = preProcessTranscription(timeline);
+    const processedTimeline = preProcessTranscription(timeline);
     if (media.language !== language) {
       if (media.mediaType === "Video") {
         await EnjoyApp.videos.update(media.id, {
     await EnjoyApp.transcriptions.update(transcription.id, {
       state: "finished",
       result: {
-        timeline: timeline,
-        transcript: alignmentResult.transcript,
+        timeline: processedTimeline,
+        transcript,
         originalText,
         tokenId,
       },
diff --git a/enjoy/src/types/enjoy-app.d.ts b/enjoy/src/types/enjoy-app.d.ts
index 8d4467ad..afccd064 100644
--- a/enjoy/src/types/enjoy-app.d.ts
+++ b/enjoy/src/types/enjoy-app.d.ts
@@ -252,6 +252,11 @@ type EnjoyAppType = {
       transcript: string,
       options?: any
     ) => Promise<AlignmentResult>;
+    alignSegments: (
+      input: string | Uint8Array,
+      timeline: Timeline,
+      options?: any
+    ) => Promise<Timeline>;
     transcode: (input: string) => Promise;
     check: () => Promise;
   };
diff --git a/enjoy/src/utils.ts b/enjoy/src/utils.ts
index 64ead8c2..dfac4d7d 100644
--- a/enjoy/src/utils.ts
+++ b/enjoy/src/utils.ts
@@ -49,7 +49,7 @@ export function milisecondsToTimestamp(ms: number) {
   const hours = Math.floor(ms / 3600000).toString();
   const minutes = Math.floor((ms % 3600000) / 60000).toString();
   const seconds = Math.floor(((ms % 360000) % 60000) / 1000).toString();
-  const milliseconds = Math.floor(((ms % 360000) % 60000) % 1000).toString();
+  const milliseconds = Math.round(((ms % 360000) % 60000) % 1000).toString();

   return `${hours.padStart(2, "0")}:${minutes.padStart(
     2,
     "0"
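
A worked example for the patched helper. Note that (ms % 360000) % 60000 equals ms % 60000, because 360000 is a multiple of 60000, so the unusual inner modulus does not affect the result. The function pads hours and minutes with padStart; assuming an SRT-style comma before the milliseconds, the output looks like this:

milisecondsToTimestamp(3725250);
// hours        = floor(3725250 / 3600000)           = "1"  -> "01"
// minutes      = floor((3725250 % 3600000) / 60000) = "2"  -> "02"
// seconds      = floor((3725250 % 60000) / 1000)    = "5"  -> "05"
// milliseconds = round(3725250 % 1000)              = "250"
// => "01:02:05,250"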