Feat: Improve alignment for audio with background noise (#870)
* use echogarden.alignSegments
* fix cloudflare whisper
* refactor azure ai transcribe
* refactor
* fix align result
* refactor
* edit transcription in srt format
* improve timeline
* refactor
* fix updating the current segment index
* validate text when using an uploaded transcript
* add form description
* refactor code
* do not change the sentence timeline based on the word timeline
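In short: when the transcript already carries segment timestamps (an uploaded SRT file, or segments returned by the speech-to-text service) and contains punctuation, word-level alignment now runs per segment through echogarden's `alignSegments`; otherwise the whole transcript is aligned with `align` as before. A minimal sketch of that flow, condensed from the `useTranscribe` changes in this diff (`alignTranscript` and `enjoyApp` are hypothetical stand-ins, not code from the commit):

```typescript
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";

// `enjoyApp` stands in for the preload bridge exposed as
// window.__ENJOY_APP__ in this diff.
declare const enjoyApp: {
  echogarden: {
    align(
      input: Uint8Array,
      transcript: string,
      options: any
    ): Promise<{ timeline: TimelineEntry[] }>;
    alignSegments(
      input: Uint8Array,
      timeline: TimelineEntry[],
      options: any
    ): Promise<TimelineEntry[]>;
  };
};

export async function alignTranscript(
  audio: Uint8Array,
  result: { text: string; timeline?: TimelineEntry[] },
  options: { language: string; isolate?: boolean }
): Promise<TimelineEntry[]> {
  // Segment timestamps exist and the text is punctuated: align words inside
  // each segment, then attach every word to the sentence that contains it.
  if (result.timeline?.length && /\w[.,!?](\s|$)/.test(result.text)) {
    const words = await enjoyApp.echogarden.alignSegments(
      audio,
      result.timeline,
      options
    );
    for (const word of words) {
      const sentence = result.timeline.find(
        (s) => word.startTime >= s.startTime && word.endTime <= s.endTime
      );
      sentence?.timeline?.push(word);
    }
    return result.timeline;
  }
  // No usable timeline: align the whole transcript from scratch.
  const { timeline } = await enjoyApp.echogarden.align(
    audio,
    result.text,
    options
  );
  return timeline;
}
```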
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "Download FFmpeg",
   "youAreReadyToGo": "You are ready to go",
   "welcomeBack": "Welcome back! {{name}}",
+  "print": "Print",
   "download": "Download",
   "downloading": "Downloading {{file}}",
   "downloadedSuccessfully": "Downloaded successfully",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
   "openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
+  "uploadSpeechToTextDescription": "Upload transcript file or input transcript text to align.",
   "checkingWhisper": "Checking whisper status",
   "pleaseDownloadWhisperModelFirst": "Please download whisper model first",
   "whisperIsWorkingGood": "Whisper is working good",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "Assessed successfully",
   "optinal": "Optional",
   "uploadTranscriptFile": "Upload transcript file",
-  "uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
+  "uploadTranscriptFileDescription": "Support formats: txt/srt/vtt.",
+  "pleaseUploadTranscriptFile": "Please upload transcript file",
   "onlyTextFileIsSupported": "Only text file is supported",
   "isolateVoice": "Isolate voice(Experimental)",
   "isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
@@ -242,6 +242,7 @@
   "downloadFfmpeg": "下载 FFmpeg",
   "youAreReadyToGo": "您已准备就绪",
   "welcomeBack": "欢迎回来, {{name}}",
+  "print": "打印",
   "download": "下载",
   "downloading": "正在下载 {{file}}",
   "downloadedSuccessfully": "下载成功",
@@ -374,6 +375,7 @@
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
   "openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
+  "uploadSpeechToTextDescription": "上传字幕文件或者输入文本进行字幕对齐",
   "checkingWhisper": "正在检查 Whisper",
   "pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
   "whisperIsWorkingGood": "Whisper 正常工作",
@@ -618,7 +620,8 @@
   "assessedSuccessfully": "评估成功",
   "optinal": "可选",
   "uploadTranscriptFile": "上传字幕文件",
-  "uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
+  "uploadTranscriptFileDescription": "支持字幕文件格式: txt/srt/vtt。",
+  "pleaseUploadTranscriptFile": "请上传字幕文件",
   "onlyTextFileIsSupported": "仅支持文本文件",
   "isolateVoice": "提取人声(实验性)",
   "isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
@@ -1,7 +1,6 @@
 import { ipcMain } from "electron";
 import * as Echogarden from "echogarden/dist/api/API.js";
 import { AlignmentOptions } from "echogarden/dist/api/API";
-import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
 import {
   encodeRawAudioToWave,
   decodeWaveToRawAudio,
@@ -9,7 +8,9 @@ import {
   getRawAudioDuration,
   trimAudioStart,
   trimAudioEnd,
+  AudioSourceParam,
 } from "echogarden/dist/audio/AudioUtilities.js";
+import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -34,6 +35,7 @@ const __dirname = path
 const logger = log.scope("echogarden");
 class EchogardenWrapper {
   public align: typeof Echogarden.align;
+  public alignSegments: typeof Echogarden.alignSegments;
   public denoise: typeof Echogarden.denoise;
   public encodeRawAudioToWave: typeof encodeRawAudioToWave;
   public decodeWaveToRawAudio: typeof decodeWaveToRawAudio;
@@ -44,6 +46,7 @@ class EchogardenWrapper {

   constructor() {
     this.align = Echogarden.align;
+    this.alignSegments = Echogarden.alignSegments;
     this.denoise = Echogarden.denoise;
     this.encodeRawAudioToWave = encodeRawAudioToWave;
     this.decodeWaveToRawAudio = decodeWaveToRawAudio;
@@ -110,6 +113,25 @@ class EchogardenWrapper {
       }
     );

+    ipcMain.handle(
+      "echogarden-align-segments",
+      async (
+        _event,
+        input: AudioSourceParam,
+        timeline: Timeline,
+        options: AlignmentOptions
+      ) => {
+        logger.debug("echogarden-align-segments:", timeline, options);
+        try {
+          const rawAudio = await this.ensureRawAudio(input, 16000);
+          return await this.alignSegments(rawAudio, timeline, options);
+        } catch (err) {
+          logger.error(err);
+          throw err;
+        }
+      }
+    );
+
     ipcMain.handle(
       "echogarden-transcode",
       async (_event, url: string, sampleRate?: number) => {
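For context, the renderer reaches this handler through the preload bridge registered later in this diff; a hypothetical invocation (mirroring how `useTranscribe` calls it) looks like:

```typescript
// Hypothetical renderer-side call; `__ENJOY_APP__` is the bridge name
// exposed via contextBridge in this commit's preload changes. Assumes
// `audioBlob` (a Blob) and `sentenceTimeline` are already in scope.
const wordTimeline = await window.__ENJOY_APP__.echogarden.alignSegments(
  new Uint8Array(await audioBlob.arrayBuffer()),
  sentenceTimeline, // entries of { type: "sentence", startTime, endTime, ... }
  { language: "en", isolate: false }
);
```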
@@ -94,7 +94,7 @@ const userDataPath = () => {

 const apiUrl = () => {
   const url: string = settings.getSync("apiUrl") as string;
-  return process.env.API_URL || url || WEB_API_URL;
+  return process.env.WEB_API_URL || url || WEB_API_URL;
 };

 export default {
@@ -105,6 +105,8 @@ class Whipser {
         `--model "${model.savePath}"`,
         "--output-json",
         `--output-file "${path.join(tmpDir, "jfk")}"`,
+        `--split-on-word true`,
+        `--max-len 1`,
       ];
       logger.debug(`Checking whisper command: ${commands.join(" ")}`);
       exec(
@@ -203,6 +205,9 @@ class Whipser {
       "--print-progress",
       "--language",
       model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
+      `--split-on-word`,
+      `--max-len`,
+      "1",
       ...extra,
     ];

@@ -2,6 +2,7 @@
 // https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
 import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
 import { version } from "../package.json";
+import { Timeline } from "echogarden/dist/utilities/Timeline";

 contextBridge.exposeInMainWorld("__ENJOY_APP__", {
   app: {
@@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     align: (input: string, transcript: string, options: any) => {
       return ipcRenderer.invoke("echogarden-align", input, transcript, options);
     },
+    alignSegments: (input: string, timeline: Timeline, options: any) => {
+      return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
+    },
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
     },
@@ -12,4 +12,4 @@ export * from "./media-provider";
 export * from "./media-tabs";
 export * from "./media-loading-modal";
 export * from "./add-media-button";
-export * from "./media-transcription-download";
+export * from "./media-transcription-print";
@@ -246,7 +246,6 @@ export const MediaCaption = () => {

     if (index < 0) return;
     if (index !== activeIndex) {
-      console.log("setActiveIndex", index);
       setActiveIndex(index);
     }
   }, [currentTime, caption]);
@@ -509,8 +508,8 @@ export const Caption = (props: {

   let words = caption.text.split(" ");
   const ipas = caption.timeline.map((w) =>
-    w.timeline.map((t) =>
-      language.startsWith("en")
+    w.timeline?.map((t) =>
+      t.timeline && language.startsWith("en")
         ? convertWordIpaToNormal(
             t.timeline.map((s) => s.text),
             { mappings: ipaMappings }
@@ -5,7 +5,7 @@ import {
 } from "@renderer/context";
 import { TabsContent, Separator } from "@renderer/components/ui";
 import { t } from "i18next";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import {
   CamdictLookupResult,
@@ -41,7 +41,9 @@ const SelectedWords = (props: {
   const { selectedIndices, caption } = props;

   const { transcription } = useContext(MediaPlayerProviderContext);
-  const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);
+  const { learningLanguage, ipaMappings } = useContext(
+    AppSettingsProviderContext
+  );

   const word = selectedIndices
     .map((index) => caption.timeline[index]?.text || "")
@@ -34,7 +34,7 @@ import { useHotkeys } from "react-hotkeys-hook";
 import cloneDeep from "lodash/cloneDeep";
 import debounce from "lodash/debounce";
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";

 const PLAYBACK_RATE_OPTIONS = [0.75, 0.8, 0.9, 1.0];
 export const MediaPlayerControls = () => {
@@ -57,7 +57,7 @@ export const MediaPlayerControls = () => {
     setTranscriptionDraft,
   } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
-  const { currentHotkeys, enabled } = useContext(
+  const { currentHotkeys } = useContext(
     HotKeysSettingsProviderContext
   );
   const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("single");
@@ -76,7 +76,9 @@ export const MediaTranscriptionGenerateButton = (props: {
     generateTranscription({
       originalText: data.text,
       language: data.language,
-      service: data.service as WhisperConfigType["service"],
+      service: data.service as
+        | WhisperConfigType["service"]
+        | "upload",
       isolate: data.isolate,
     })
       .then(() => {
@@ -9,7 +9,7 @@ import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { convertWordIpaToNormal } from "@/utils";
 import template from "./transcription.template.html?raw";

-export const MediaTranscriptionDownload = () => {
+export const MediaTranscriptionPrint = () => {
   const { media, transcription } = useContext(MediaPlayerProviderContext);
   const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
     AppSettingsProviderContext
@@ -59,7 +59,7 @@ export const MediaTranscriptionDownload = () => {
   async function download() {
     try {
       const savePath = await EnjoyApp.dialog.showSaveDialog({
-        title: t("download"),
+        title: t("print"),
         defaultPath: `${media.name}.pdf`,
       });

@@ -75,7 +75,7 @@ export const MediaTranscriptionDownload = () => {

   return (
     <Button variant="ghost" className="block w-full" onClick={download}>
-      {t("download")}
+      {t("print")}
     </Button>
   );
 };
@@ -28,7 +28,7 @@ import {
   SheetHeader,
   toast,
 } from "@renderer/components/ui";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { t } from "i18next";
 import WaveSurfer from "wavesurfer.js";
 import {
@@ -26,7 +26,7 @@ import { formatDuration } from "@renderer/lib/utils";
 import {
   MediaTranscriptionReadButton,
   MediaTranscriptionGenerateButton,
-  MediaTranscriptionDownload,
+  MediaTranscriptionPrint,
   TranscriptionEditButton,
 } from "@renderer/components";

@@ -165,7 +165,7 @@ export const MediaTranscription = (props: { display?: boolean }) => {
           </TranscriptionEditButton>
         </DropdownMenuItem>
         <DropdownMenuItem asChild>
-          <MediaTranscriptionDownload />
+          <MediaTranscriptionPrint />
         </DropdownMenuItem>
       </DropdownMenuContent>
     </DropdownMenu>
@@ -1,4 +1,4 @@
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { useContext, useState } from "react";
 import { WavesurferPlayer } from "@/renderer/components/misc";
 import { AppSettingsProviderContext } from "@/renderer/context";
@@ -3,14 +3,11 @@ import {
   AppSettingsProviderContext,
 } from "@renderer/context";
 import { zodResolver } from "@hookform/resolvers/zod";
-import { useContext, useState } from "react";
+import { useContext } from "react";
 import { useForm } from "react-hook-form";
 import { z } from "zod";
 import {
   Button,
-  Collapsible,
-  CollapsibleContent,
-  CollapsibleTrigger,
   Form,
   FormDescription,
   FormField,
@@ -31,8 +28,9 @@ import {
 } from "@renderer/components/ui";
 import { t } from "i18next";
 import { LANGUAGES } from "@/constants";
-import { ChevronDownIcon, ChevronUpIcon, LoaderIcon } from "lucide-react";
+import { LoaderIcon } from "lucide-react";
 import { parseText } from "media-captions";
+import { milisecondsToTimestamp } from "@/utils";

 const transcriptionSchema = z.object({
   language: z.string(),
@@ -59,18 +57,28 @@ export const TranscriptionCreateForm = (props: {
   } = props;
   const { learningLanguage } = useContext(AppSettingsProviderContext);
   const { whisperConfig } = useContext(AISettingsProviderContext);
-  const [collapsibleOpen, setCollapsibleOpen] = useState(false);

   const form = useForm<z.infer<typeof transcriptionSchema>>({
     resolver: zodResolver(transcriptionSchema),
     values: {
       language: learningLanguage,
-      service: whisperConfig.service,
+      service: originalText ? "upload" : whisperConfig.service,
       text: originalText,
       isolate: false,
     },
   });

+  const handleSubmit = (data: z.infer<typeof transcriptionSchema>) => {
+    const { service, text } = data;
+
+    if (service === "upload" && !text) {
+      toast.error(t("pleaseUploadTranscriptFile"));
+      return;
+    }
+
+    onSubmit(data);
+  };
+
   const parseSubtitle = (file: File) => {
     const fileType = file.name.split(".").pop();
     return new Promise<string>((resolve, reject) => {
@@ -88,7 +96,16 @@ export const TranscriptionCreateForm = (props: {
       if (caption.cues.length === 0) {
         text = cleanSubtitleText(text as string);
       } else {
-        text = caption.cues.map((cue) => cue.text).join("\n");
+        // Write cues to text in SRT format
+        text = caption.cues
+          .map((cue, _) => {
+            return `${milisecondsToTimestamp(
+              cue.startTime * 1000
+            )} --> ${milisecondsToTimestamp(cue.endTime * 1000)}\n${
+              cue.text
+            }`;
+          })
+          .join("\n\n");
       }

       if (text.length === 0) {
@@ -126,7 +143,7 @@ export const TranscriptionCreateForm = (props: {
   return (
     <Form {...form}>
       <form
-        onSubmit={form.handleSubmit(onSubmit)}
+        onSubmit={form.handleSubmit(handleSubmit)}
         className="gap-4 grid w-full"
       >
         <FormField
@@ -150,8 +167,21 @@ export const TranscriptionCreateForm = (props: {
                   {t("cloudflareAi")}
                 </SelectItem>
                 <SelectItem value="openai">OpenAI</SelectItem>
+                <SelectItem value="upload">{t("upload")}</SelectItem>
               </SelectContent>
             </Select>
+            <FormDescription>
+              {form.watch("service") === "local" &&
+                t("localSpeechToTextDescription")}
+              {form.watch("service") === "azure" &&
+                t("azureSpeechToTextDescription")}
+              {form.watch("service") === "cloudflare" &&
+                t("cloudflareSpeechToTextDescription")}
+              {form.watch("service") === "openai" &&
+                t("openaiSpeechToTextDescription")}
+              {form.watch("service") === "upload" &&
+                t("uploadSpeechToTextDescription")}
+            </FormDescription>
           </FormItem>
         )}
       />
@@ -181,16 +211,14 @@ export const TranscriptionCreateForm = (props: {
           </FormItem>
         )}
       />
-      <Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
-        <CollapsibleContent className="space-y-4 mb-4">
+      {form.watch("service") === "upload" && (
+        <>
           <FormField
             control={form.control}
             name="text"
             render={({ field }) => (
               <FormItem className="grid w-full items-center">
-                <FormLabel>
-                  {t("uploadTranscriptFile")}({t("optinal")})
-                </FormLabel>
+                <FormLabel>{t("uploadTranscriptFile")}</FormLabel>
                 <Input
                   disabled={transcribing}
                   type="file"
@@ -245,25 +273,8 @@ export const TranscriptionCreateForm = (props: {
               </FormItem>
             )}
           />
-        </CollapsibleContent>
-        <div className="flex justify-center">
-          <CollapsibleTrigger asChild>
-            <Button variant="ghost" size="sm">
-              {collapsibleOpen ? (
-                <>
-                  <ChevronUpIcon className="h-4 w-4" />
-                  <span className="ml-2">{t("lessOptions")}</span>
-                </>
-              ) : (
-                <>
-                  <ChevronDownIcon className="h-4 w-4" />
-                  <span className="ml-2">{t("moreOptions")}</span>
-                </>
-              )}
-            </Button>
-          </CollapsibleTrigger>
-        </div>
-      </Collapsible>
+        </>
+      )}

       <TranscribeProgress
         service={form.watch("service")}
@@ -20,26 +20,36 @@ import {
   Textarea,
   toast,
 } from "@renderer/components/ui";
-import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
+import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { t } from "i18next";
-import { useContext, useState } from "react";
+import { useContext, useEffect, useState } from "react";
 import { LoaderIcon } from "lucide-react";
+import { milisecondsToTimestamp } from "@/utils";

 export const TranscriptionEditButton = (props: {
   children?: React.ReactNode;
 }) => {
-  const [open, setOpen] = useState(false);
-  const [submiting, setSubmiting] = useState(false);
-  const { transcription, generateTranscription } = useContext(
+  const { media, transcription, generateTranscription } = useContext(
     MediaPlayerProviderContext
   );
+  const [open, setOpen] = useState(false);
+  const [submiting, setSubmiting] = useState(false);
   const [content, setContent] = useState<string>(
-    transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
+    // generate text in SRT format from timeline entries
+    transcription.result.timeline
+      .map(
+        (t: TimelineEntry) =>
+          `${milisecondsToTimestamp(
+            t.startTime * 1000
+          )} --> ${milisecondsToTimestamp(t.endTime * 1000)}\n${t.text}`
+      )
+      .join("\n\n")
   );
+  const [downloadUrl, setDownloadUrl] = useState<string>();

   const handleSave = async () => {
     setSubmiting(true);
-    generateTranscription({ originalText: content })
+    generateTranscription({ originalText: content, service: "upload" })
       .then(() => setOpen(false))
       .catch((e) => {
         toast.error(e.message);
@@ -47,6 +57,13 @@ export const TranscriptionEditButton = (props: {
       .finally(() => setSubmiting(false));
   };

+  useEffect(() => {
+    if (!content) return;
+
+    const blob = new Blob([content], { type: "text/html" });
+    setDownloadUrl(URL.createObjectURL(blob));
+  }, [content]);
+
   return (
     <Dialog open={open} onOpenChange={setOpen}>
       <DialogTrigger asChild>
@@ -76,6 +93,11 @@ export const TranscriptionEditButton = (props: {
               {t("cancel")}
             </Button>
           </DialogClose>
+          <DialogClose asChild>
+            <a download={`${media.name}.srt`} href={downloadUrl}>
+              <Button variant="secondary">{t("download")}</Button>
+            </a>
+          </DialogClose>

           <AlertDialog>
             <AlertDialogTrigger asChild>
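The Textarea is therefore seeded with SRT-style blocks like the following (illustrative values; the generator emits bare timestamp lines without standard SRT's numeric cue counters, and the comma millisecond separator is assumed from the truncated `milisecondsToTimestamp` template), and the same text becomes the `${media.name}.srt` download:

```
00:00:01,000 --> 00:00:03,500
Hello world.

00:00:03,800 --> 00:00:06,200
How are you?
```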
@@ -68,7 +68,7 @@ type MediaPlayerContextType = {
   generateTranscription: (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => Promise<void>;
   transcribing: boolean;
@@ -352,7 +352,7 @@ export const MediaPlayerProvider = ({

   let phones: TimelineEntry[] = [];
   words.forEach((word: TimelineEntry) => {
-    word.timeline.forEach((token: TimelineEntry) => {
+    word.timeline?.forEach((token: TimelineEntry) => {
       phones = phones.concat(token.timeline);
     });
   });
@@ -8,9 +8,92 @@ import { t } from "i18next";
 import { AI_WORKER_ENDPOINT } from "@/constants";
 import * as sdk from "microsoft-cognitiveservices-speech-sdk";
 import axios from "axios";
 import { AlignmentResult } from "echogarden/dist/api/API.d.js";
 import { useAiCommand } from "./use-ai-command";
 import { toast } from "@renderer/components/ui";
+import {
+  Timeline,
+  TimelineEntry,
+  type TimelineEntryType,
+} from "echogarden/dist/utilities/Timeline";
+import take from "lodash/take";
+import sortedUniqBy from "lodash/sortedUniqBy";
+import { parseText } from "media-captions";
+
+/*
+ * define the regex pattern to match the end of a sentence
+ * the end of a sentence is defined as a period, question mark, or exclamation mark
+ * also it may be followed by a quotation mark
+ * and exclude special cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc."
+ */
+const sentenceEndPattern = /(?<!Mr|Mrs|Dr|Ms|etc)\.|\?|!\"?/;
+
+// test whether a text string has any punctuation or not
+// some transcribed text may not have any punctuation
+const punctuationsPattern = /\w[.,!?](\s|$)/g;
+
+/*
+ * convert the word timeline to a sentence timeline
+ * a sentence is a group of words that ends with a punctuation
+ */
+const wordTimelineToSentenceTimeline = (
+  wordTimeline: TimelineEntry[]
+): TimelineEntry[] => {
+  const timeline: TimelineEntry[] = [];
+
+  wordTimeline.forEach((word, index) => {
+    word.text = word.text.trim();
+    // skip empty words
+    if (!word.text) return;
+    // skip music or sound effects quoted in []
+    if (word.text.match(/^\[.*\]$/)) return;
+
+    const wordEntry = {
+      type: "word" as TimelineEntryType,
+      text: word.text,
+      startTime: word.startTime,
+      endTime: word.endTime,
+    };
+
+    let sentence: TimelineEntry;
+    // get the last sentence in the timeline
+    if (timeline.length > 0) {
+      sentence = timeline[timeline.length - 1];
+    }
+
+    // if there is no sentence in the timeline, create a new sentence
+    // if the last sentence already ends with a punctuation, create a new sentence
+    if (!sentence || sentence.text.match(sentenceEndPattern)) {
+      sentence = {
+        type: "sentence" as TimelineEntryType,
+        text: "",
+        startTime: wordEntry.startTime,
+        endTime: wordEntry.endTime,
+        timeline: [],
+      };
+      timeline.push(sentence);
+    }
+
+    // if the word ends with a punctuation, add it to the sentence and start a new sentence
+    if (wordEntry.text.match(sentenceEndPattern)) {
+      sentence.text += wordEntry.text;
+      sentence.endTime = wordEntry.endTime;
+
+      const lastSentence = timeline[timeline.length - 1];
+      if (lastSentence.endTime !== sentence.endTime) {
+        timeline.push(sentence);
+      }
+    } else {
+      sentence.text += wordEntry.text + " ";
+      sentence.endTime = wordEntry.endTime;
+
+      if (index === wordTimeline.length - 1) {
+        timeline.push(sentence);
+      }
+    }
+  });
+
+  return timeline;
+};
+
 export const useTranscribe = () => {
   const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
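As a concrete illustration of the grouping rule above (not part of the diff; times are in seconds, and the `TimelineEntry` objects are reduced to the fields the function reads):

```typescript
// Illustrative input/output for wordTimelineToSentenceTimeline.
const words = [
  { type: "word", text: "Hello", startTime: 0.0, endTime: 0.4 },
  { type: "word", text: "world.", startTime: 0.5, endTime: 0.9 },
  { type: "word", text: "[music]", startTime: 1.0, endTime: 1.1 }, // skipped
  { type: "word", text: "How", startTime: 1.2, endTime: 1.4 },
  { type: "word", text: "are", startTime: 1.5, endTime: 1.6 },
  { type: "word", text: "you?", startTime: 1.7, endTime: 2.0 },
] as TimelineEntry[];

const sentences = wordTimelineToSentenceTimeline(words);
// => [
//   { type: "sentence", text: "Hello world.", startTime: 0.0, endTime: 0.9, ... },
//   { type: "sentence", text: "How are you?", startTime: 1.2, endTime: 2.0, ... },
// ]
```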
@@ -37,13 +120,14 @@ export const useTranscribe = () => {
       targetType?: string;
       originalText?: string;
       language: string;
-      service: WhisperConfigType["service"];
+      service: WhisperConfigType["service"] | "upload";
       isolate?: boolean;
     }
   ): Promise<{
     engine: string;
     model: string;
-    alignmentResult: AlignmentResult;
+    transcript: string;
+    timeline: TimelineEntry[];
     originalText?: string;
     tokenId?: number;
   }> => {
@@ -58,67 +142,152 @@ export const useTranscribe = () => {
     } = params || {};
     const blob = await (await fetch(url)).blob();

-    let result;
-    if (originalText) {
-      result = {
-        engine: "original",
-        model: "original",
-      };
+    let result: any;
+    let timeline: Timeline = [];
+    if (service === "upload" && originalText) {
+      const caption = await parseText(originalText, { type: "srt" });
+      if (caption.cues.length > 0) {
+        timeline = caption.cues.map((cue) => {
+          return {
+            type: "sentence",
+            text: cue.text,
+            startTime: cue.startTime,
+            endTime: cue.endTime,
+            timeline: [],
+          };
+        });
+        result = {
+          engine: "upload",
+          model: "-",
+          text: timeline.map((entry) => entry.text).join(" "),
+          timeline,
+        };
+      } else {
+        result = {
+          engine: "upload",
+          model: "-",
+          text: originalText,
+        };
+      }
     } else if (service === "local") {
       result = await transcribeByLocal(url, language);
     } else if (service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (service === "openai") {
-      result = await transcribeByOpenAi(blob);
+      result = await transcribeByOpenAi(
+        new File([blob], "audio.mp3", { type: "audio/mp3" })
+      );
     } else if (service === "azure") {
-      result = await transcribeByAzureAi(blob, language, {
-        targetId,
-        targetType,
-      });
+      result = await transcribeByAzureAi(
+        new File([blob], "audio.wav", { type: "audio/wav" }),
+        language,
+        {
+          targetId,
+          targetType,
+        }
+      );
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }
+    let transcript = result.text;

     setOutput(null);
-
-    let transcript = originalText || result.text;
-
-    // Remove all content inside `()`, `[]`, `{}` and trim the text
-    // remove all markdown formatting
-    transcript = transcript
-      .replace(/\(.*?\)/g, "")
-      .replace(/\[.*?\]/g, "")
-      .replace(/\{.*?\}/g, "")
-      .replace(/[*_`]/g, "")
-      .trim();
-
-    // if the transcript does not contain any punctuation, use AI command to add punctuation
-    if (!transcript.match(/\w[.,!?](\s|$)/)) {
-      try {
-        transcript = await punctuateText(transcript);
-      } catch (err) {
-        toast.error(err.message);
-        console.warn(err.message);
-      }
-    }
-
-    const alignmentResult = await EnjoyApp.echogarden.align(
-      new Uint8Array(await blob.arrayBuffer()),
-      transcript,
-      {
-        language,
-        isolate,
-      }
-    );
-
+    /*
+     * if timeline is available and the transcript contains punctuations
+     * use `alignSegments` to align each sentence with the timeline
+     * otherwise, use `align` to align the whole transcript
+     * if the transcript does not contain any punctuation, use AI command to add punctuation
+     */
+    if (result.timeline?.length && transcript.match(punctuationsPattern)) {
+      timeline = [...result.timeline];
+      setOutput("Aligning the transcript...");
+      const wordTimeline = await EnjoyApp.echogarden.alignSegments(
+        new Uint8Array(await blob.arrayBuffer()),
+        timeline,
+        {
+          language,
+          isolate,
+        }
+      );
+
+      wordTimeline.forEach((word: TimelineEntry) => {
+        let sentence = timeline.find(
+          (entry) =>
+            word.startTime >= entry.startTime && word.endTime <= entry.endTime
+        );
+
+        if (sentence) {
+          sentence.timeline.push(word);
+        }
+      });
+
+      /*
+       * the start time of a sentence should be the start time of the first word in the sentence
+       * the end time of a sentence should be the end time of the last word in the sentence
+       */
+      // timeline.forEach((t) => {
+      //   if (t.timeline.length === 0) return;
+
+      //   t.startTime = t.timeline[0].startTime;
+      //   t.endTime = t.timeline[t.timeline.length - 1].endTime;
+      // });
+    } else {
+      // Remove all content inside `()`, `[]`, `{}` and trim the text
+      // remove all markdown formatting
+      transcript = transcript
+        .replace(/\(.*?\)/g, "")
+        .replace(/\[.*?\]/g, "")
+        .replace(/\{.*?\}/g, "")
+        .replace(/[*_`]/g, "")
+        .trim();
+
+      // if the transcript does not contain any punctuation, use AI command to add punctuation
+      if (!transcript.match(punctuationsPattern)) {
+        try {
+          transcript = await punctuateText(transcript);
+        } catch (err) {
+          toast.error(err.message);
+          console.warn(err.message);
+        }
+      }
+
+      setOutput("Aligning the transcript...");
+      const alignmentResult = await EnjoyApp.echogarden.align(
+        new Uint8Array(await blob.arrayBuffer()),
+        transcript,
+        {
+          language,
+          isolate,
+        }
+      );
+
+      alignmentResult.timeline.forEach((t: TimelineEntry) => {
+        if (t.type === "sentence") {
+          timeline.push(t);
+        } else {
+          t.timeline.forEach((st) => {
+            timeline.push(st);
+          });
+        }
+      });
+    }

     return {
       ...result,
       originalText,
-      alignmentResult,
+      transcript,
+      timeline,
     };
   };

-  const transcribeByLocal = async (url: string, language?: string) => {
+  const transcribeByLocal = async (
+    url: string,
+    language?: string
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline: TimelineEntry[];
+  }> => {
     const res = await EnjoyApp.whisper.transcribe(
       {
         file: url,
@@ -130,14 +299,25 @@ export const useTranscribe = () => {
       }
     );

+    const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.text,
+        startTime: word.offsets.from / 1000.0,
+        endTime: word.offsets.to / 1000.0,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "whisper",
       model: res.model.type,
       text: res.transcription.map((segment) => segment.text).join(" "),
+      timeline,
     };
   };

-  const transcribeByOpenAi = async (blob: Blob) => {
+  const transcribeByOpenAi = async (file: File) => {
     if (!openai?.key) {
       throw new Error(t("openaiKeyRequired"));
     }
@@ -149,20 +329,58 @@ export const useTranscribe = () => {
       maxRetries: 0,
     });

-    const res: { text: string } = (await client.audio.transcriptions.create({
-      file: new File([blob], "audio.wav"),
+    const res: {
+      text: string;
+      words?: { word: string; start: number; end: number }[];
+      segments?: { text: string; start: number; end: number }[];
+    } = (await client.audio.transcriptions.create({
+      file,
       model: "whisper-1",
-      response_format: "json",
+      response_format: "verbose_json",
+      timestamp_granularities: ["word"],
     })) as any;

+    let timeline: TimelineEntry[] = [];
+    if (res.segments) {
+      res.segments.forEach((segment) => {
+        const segmentTimeline = {
+          type: "sentence" as TimelineEntryType,
+          text: segment.text,
+          startTime: segment.start,
+          endTime: segment.end,
+          timeline: [] as Timeline,
+        };
+
+        timeline.push(segmentTimeline);
+      });
+    } else if (res.words) {
+      const wordTimeline = res.words.map((word) => {
+        return {
+          type: "word" as TimelineEntryType,
+          text: word.word,
+          startTime: word.start,
+          endTime: word.end,
+        };
+      });
+      timeline = wordTimelineToSentenceTimeline(wordTimeline);
+    }
+
     return {
       engine: "openai",
       model: "whisper-1",
       text: res.text,
+      timeline,
     };
   };

-  const transcribeByCloudflareAi = async (blob: Blob) => {
+  const transcribeByCloudflareAi = async (
+    blob: Blob
+  ): Promise<{
+    engine: string;
+    model: string;
+    text: string;
+    timeline?: TimelineEntry[];
+  }> => {
     const res: CfWhipserOutputType = (
       await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
         headers: {
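A note on the OpenAI request above: as of the current OpenAI API, `response_format: "verbose_json"` returns sentence-level `segments` by default, and `timestamp_granularities: ["word"]` additionally yields a word-level `words` array. That is why the code prefers `res.segments` when present and only falls back to rebuilding sentences from `res.words`.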
@@ -172,15 +390,26 @@ export const useTranscribe = () => {
       })
     ).data;

+    const wordTimeline = res.words.map((word) => {
+      return {
+        type: "word" as TimelineEntryType,
+        text: word.word,
+        startTime: word.start,
+        endTime: word.end,
+      };
+    });
+    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
     return {
       engine: "cloudflare",
       model: "@cf/openai/whisper",
       text: res.text,
+      timeline,
     };
   };

   const transcribeByAzureAi = async (
-    blob: Blob,
+    file: File,
     language: string,
     params?: {
       targetId?: string;
|
||||
model: string;
|
||||
text: string;
|
||||
tokenId: number;
|
||||
timeline?: TimelineEntry[];
|
||||
}> => {
|
||||
const { id, token, region } = await webApi.generateSpeechToken(params);
|
||||
const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
|
||||
const audioConfig = sdk.AudioConfig.fromWavFileInput(
|
||||
new File([blob], "audio.wav")
|
||||
);
|
||||
const audioConfig = sdk.AudioConfig.fromWavFileInput(file);
|
||||
// setting the recognition language to learning language, such as 'en-US'.
|
||||
config.speechRecognitionLanguage = language;
|
||||
config.requestWordLevelTimestamps();
|
||||
@@ -209,7 +437,6 @@ export const useTranscribe = () => {

     return new Promise((resolve, reject) => {
       reco.recognizing = (_s, e) => {
-        console.log(e.result);
         setOutput(e.result.text);
       };

@@ -232,10 +459,40 @@ export const useTranscribe = () => {
       reco.sessionStopped = (_s, _e) => {
         reco.stopContinuousRecognitionAsync();

+        const wordTimeline: TimelineEntry[] = [];
+        results.forEach((result) => {
+          const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
+          const splitedWords = best.Display.trim().split(" ");
+
+          best.Words.forEach((word, index) => {
+            let text = word.Word;
+            if (splitedWords.length === best.Words.length) {
+              text = splitedWords[index];
+            }
+
+            if (
+              index === best.Words.length - 1 &&
+              !text.trim().match(sentenceEndPattern)
+            ) {
+              text = text + ".";
+            }
+
+            wordTimeline.push({
+              type: "word" as TimelineEntryType,
+              text,
+              startTime: word.Offset / 10000000.0,
+              endTime: (word.Offset + word.Duration) / 10000000.0,
+            });
+          });
+        });
+
+        const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+
         resolve({
           engine: "azure",
           model: "whisper",
           text: results.map((result) => result.DisplayText).join(" "),
+          timeline,
           tokenId: id,
         });
       };
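The `10000000.0` divisors above reflect that the Azure Speech SDK reports `Offset` and `Duration` in 100-nanosecond ticks, so dividing by 10,000,000 converts ticks to seconds (e.g., an `Offset` of 15,000,000 ticks is 1.5 s).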
@@ -20,9 +20,9 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
   const [transcribing, setTranscribing] = useState<boolean>(false);
   const [transcribingOutput, setTranscribingOutput] = useState<string>("");
-  const [service, setService] = useState<WhisperConfigType["service"]>(
-    whisperConfig.service
-  );
+  const [service, setService] = useState<
+    WhisperConfigType["service"] | "upload"
+  >(whisperConfig.service);

   const onTransactionUpdate = (event: CustomEvent) => {
     if (!transcription) return;
@@ -63,7 +63,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   const generateTranscription = async (params?: {
     originalText?: string;
     language?: string;
-    service?: WhisperConfigType["service"];
+    service?: WhisperConfigType["service"] | "upload";
     isolate?: boolean;
   }) => {
     let {
@@ -87,7 +87,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
         }
       }
     }
-    const { engine, model, alignmentResult, tokenId } = await transcribe(
+    const { engine, model, transcript, timeline, tokenId } = await transcribe(
       media.src,
       {
         targetId: media.id,
|
||||
}
|
||||
);
|
||||
|
||||
let timeline: TimelineEntry[] = [];
|
||||
alignmentResult.timeline.forEach((t) => {
|
||||
if (t.type === "sentence") {
|
||||
timeline.push(t);
|
||||
} else {
|
||||
t.timeline.forEach((st) => {
|
||||
timeline.push(st);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
timeline = preProcessTranscription(timeline);
|
||||
const processedTimeline = preProcessTranscription(timeline);
|
||||
if (media.language !== language) {
|
||||
if (media.mediaType === "Video") {
|
||||
await EnjoyApp.videos.update(media.id, {
|
||||
@@ -126,8 +115,8 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     await EnjoyApp.transcriptions.update(transcription.id, {
       state: "finished",
       result: {
-        timeline: timeline,
-        transcript: alignmentResult.transcript,
+        timeline: processedTimeline,
+        transcript,
         originalText,
         tokenId,
       },
enjoy/src/types/enjoy-app.d.ts
@@ -252,6 +252,11 @@ type EnjoyAppType = {
       transcript: string,
       options?: any
     ) => Promise<AlignmentResult>;
+    alignSegments: (
+      input: string | Uint8Array,
+      timeline: Timeline,
+      options?: any
+    ) => Promise<Timeline>;
     transcode: (input: string) => Promise<string>;
     check: () => Promise<boolean>;
   };
@@ -49,7 +49,7 @@ export function milisecondsToTimestamp(ms: number) {
   const hours = Math.floor(ms / 3600000).toString();
   const minutes = Math.floor((ms % 3600000) / 60000).toString();
   const seconds = Math.floor(((ms % 360000) % 60000) / 1000).toString();
-  const milliseconds = Math.floor(((ms % 360000) % 60000) % 1000).toString();
+  const milliseconds = Math.round(((ms % 360000) % 60000) % 1000).toString();
   return `${hours.padStart(2, "0")}:${minutes.padStart(
     2,
     "0"
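Worked example for the function above: ms = 3,723,456 gives hours = 1, minutes = 2, seconds = 3, and (with this change, `Math.round` instead of `Math.floor`) milliseconds = 456, i.e. `01:02:03,456` assuming the SRT-style separator in the truncated tail of the template string. The inner `% 360000` is harmless rather than a bug: 360,000 is a multiple of 60,000, so `(ms % 360000) % 60000` equals `ms % 60000`.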