Improve stt (#764)
* refactor * add isolate option for alignment * setup isolate for alignment * refactor transcription form * add transcribing output * update locale * refactor * display transcribing output * cannot cancel when transcribing
This commit is contained in:
@@ -611,8 +611,11 @@
|
||||
"assessing": "Assessing",
|
||||
"assessedSuccessfully": "Assessed successfully",
|
||||
"optinal": "Optional",
|
||||
"uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)",
|
||||
"uploadTranscriptFile": "Upload transcript file",
|
||||
"uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
|
||||
"onlyTextFileIsSupported": "Only text file is supported",
|
||||
"isolateVoice": "Isolate voice",
|
||||
"isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
|
||||
"sortBy": "Sort by",
|
||||
"createdAtDesc": "Created at desc",
|
||||
"createdAtAsc": "Created at asc",
|
||||
@@ -627,5 +630,6 @@
|
||||
"search": "Search",
|
||||
"noData": "No data",
|
||||
"selectedFiles": "Selected files",
|
||||
"moreOptions": "More options"
|
||||
"moreOptions": "More options",
|
||||
"lessOptions": "Less options"
|
||||
}
|
||||
|
||||
@@ -611,8 +611,11 @@
|
||||
"assessing": "正在评估",
|
||||
"assessedSuccessfully": "评估成功",
|
||||
"optinal": "可选",
|
||||
"uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)",
|
||||
"uploadTranscriptFile": "上传字幕文件",
|
||||
"uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
|
||||
"onlyTextFileIsSupported": "仅支持文本文件",
|
||||
"isolateVoice": "提取人声",
|
||||
"isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
|
||||
"sortBy": "排序",
|
||||
"createdAtDesc": "创建时间降序",
|
||||
"createdAtAsc": "创建时间升序",
|
||||
@@ -627,5 +630,6 @@
|
||||
"search": "搜索",
|
||||
"noData": "没有数据",
|
||||
"selectedFiles": "已选中文件",
|
||||
"moreOptions": "更多选项"
|
||||
"moreOptions": "更多选项",
|
||||
"lessOptions": "更少选项"
|
||||
}
|
||||
|
||||
@@ -431,6 +431,28 @@ ${log}
|
||||
return { action: "allow" };
|
||||
});
|
||||
|
||||
// Capture stderr & stdout and send them to the renderer on the
// "app-on-cmd-output" channel, while still writing to the real streams.

// ANSI escape sequences (colors, cursor/erase codes) stripped before forwarding.
const ansiEscapeRegex = /\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g;

// Forward one written chunk to the renderer as plain text.
const forwardCmdOutput = (chunk: string | Uint8Array) => {
  // The main process can keep writing after the window is closed; sending to
  // a destroyed window/webContents throws, which would break every later
  // stdout/stderr write.
  if (mainWindow.isDestroyed() || mainWindow.webContents.isDestroyed()) return;

  // A plain Uint8Array's toString() yields comma-separated byte values, so
  // decode binary chunks through Buffer instead.
  const text =
    typeof chunk === "string" ? chunk : Buffer.from(chunk).toString();

  mainWindow.webContents.send(
    "app-on-cmd-output",
    text.replace(ansiEscapeRegex, "")
  );
};

const originalStderrWrite = process.stderr.write.bind(process.stderr);
process.stderr.write = (chunk, encoding?, callback?) => {
  forwardCmdOutput(chunk);

  return originalStderrWrite(chunk, encoding, callback);
};

const originalStdoutWrite = process.stdout.write.bind(process.stdout);
process.stdout.write = (chunk, encoding?, callback?) => {
  forwardCmdOutput(chunk);

  return originalStdoutWrite(chunk, encoding, callback);
};
|
||||
|
||||
// and load the index.html of the app.
|
||||
if (MAIN_WINDOW_VITE_DEV_SERVER_URL) {
|
||||
mainWindow.loadURL(MAIN_WINDOW_VITE_DEV_SERVER_URL);
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
// https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
|
||||
import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
|
||||
import { version } from "../package.json";
|
||||
import { callback } from "chart.js/dist/helpers/helpers.core";
|
||||
import { remove } from "lodash";
|
||||
|
||||
contextBridge.exposeInMainWorld("__ENJOY_APP__", {
|
||||
app: {
|
||||
@@ -35,6 +37,12 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
|
||||
createIssue: (title: string, body: string) => {
|
||||
return ipcRenderer.invoke("app-create-issue", title, body);
|
||||
},
|
||||
onCmdOutput: (callback: (event: IpcRendererEvent, data: string) => void) => {
|
||||
ipcRenderer.on("app-on-cmd-output", callback);
|
||||
},
|
||||
removeCmdOutputListeners: () => {
|
||||
ipcRenderer.removeAllListeners("app-on-cmd-output");
|
||||
},
|
||||
version,
|
||||
},
|
||||
window: {
|
||||
|
||||
@@ -64,7 +64,7 @@ export const MediaCurrentRecording = () => {
|
||||
currentTime: mediaCurrentTime,
|
||||
} = useContext(MediaPlayerProviderContext);
|
||||
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
|
||||
const { enabled, currentHotkeys } = useContext(
|
||||
const { currentHotkeys } = useContext(
|
||||
HotKeysSettingsProviderContext
|
||||
);
|
||||
const [player, setPlayer] = useState(null);
|
||||
|
||||
@@ -9,6 +9,7 @@ import {
|
||||
AlertDialogContent,
|
||||
AlertDialogTitle,
|
||||
AlertDialogDescription,
|
||||
toast,
|
||||
} from "@renderer/components/ui";
|
||||
import { LoaderIcon } from "lucide-react";
|
||||
import { TranscriptionCreateForm } from "../transcriptions";
|
||||
@@ -22,6 +23,7 @@ export const MediaTranscriptionGenerateButton = (props: {
|
||||
transcribing,
|
||||
transcription,
|
||||
transcribingProgress,
|
||||
transcribingOutput,
|
||||
} = useContext(MediaPlayerProviderContext);
|
||||
const [open, setOpen] = useState(false);
|
||||
|
||||
@@ -62,11 +64,18 @@ export const MediaTranscriptionGenerateButton = (props: {
|
||||
originalText: data.text,
|
||||
language: data.language,
|
||||
service: data.service as WhisperConfigType["service"],
|
||||
});
|
||||
setOpen(false);
|
||||
isolate: data.isolate,
|
||||
})
|
||||
.then(() => {
|
||||
setOpen(false);
|
||||
})
|
||||
.catch((e) => {
|
||||
toast.error(e.message);
|
||||
});
|
||||
}}
|
||||
transcribing={transcribing}
|
||||
transcribingProgress={transcribingProgress}
|
||||
transcribingOutput={transcribingOutput}
|
||||
/>
|
||||
</AlertDialogContent>
|
||||
</AlertDialog>
|
||||
|
||||
@@ -12,6 +12,7 @@ import {
|
||||
CollapsibleContent,
|
||||
CollapsibleTrigger,
|
||||
Form,
|
||||
FormDescription,
|
||||
FormField,
|
||||
FormItem,
|
||||
FormLabel,
|
||||
@@ -24,6 +25,7 @@ import {
|
||||
SelectItem,
|
||||
SelectTrigger,
|
||||
SelectValue,
|
||||
Switch,
|
||||
Textarea,
|
||||
toast,
|
||||
} from "@renderer/components/ui";
|
||||
@@ -36,18 +38,21 @@ const transcriptionSchema = z.object({
|
||||
language: z.string(),
|
||||
service: z.string(),
|
||||
text: z.string().optional(),
|
||||
isolate: z.boolean().optional(),
|
||||
});
|
||||
|
||||
export const TranscriptionCreateForm = (props: {
|
||||
onSubmit: (data: z.infer<typeof transcriptionSchema>) => void;
|
||||
originalText?: string;
|
||||
onCancel?: () => void;
|
||||
transcribing?: boolean;
|
||||
transcribingProgress?: number;
|
||||
transcribing: boolean;
|
||||
transcribingProgress: number;
|
||||
transcribingOutput: string;
|
||||
}) => {
|
||||
const {
|
||||
transcribing = false,
|
||||
transcribingProgress = 0,
|
||||
transcribingOutput,
|
||||
onSubmit,
|
||||
onCancel,
|
||||
originalText,
|
||||
@@ -62,6 +67,7 @@ export const TranscriptionCreateForm = (props: {
|
||||
language: learningLanguage,
|
||||
service: whisperConfig.service,
|
||||
text: originalText,
|
||||
isolate: false,
|
||||
},
|
||||
});
|
||||
|
||||
@@ -127,7 +133,7 @@ export const TranscriptionCreateForm = (props: {
|
||||
control={form.control}
|
||||
name="service"
|
||||
render={({ field }) => (
|
||||
<FormItem className="grid w-full items-center gap-1.5">
|
||||
<FormItem className="grid w-full items-center">
|
||||
<FormLabel>{t("sttAiService")}</FormLabel>
|
||||
<Select
|
||||
disabled={transcribing}
|
||||
@@ -153,7 +159,7 @@ export const TranscriptionCreateForm = (props: {
|
||||
control={form.control}
|
||||
name="language"
|
||||
render={({ field }) => (
|
||||
<FormItem className="grid w-full items-center gap-1.5">
|
||||
<FormItem className="grid w-full items-center">
|
||||
<FormLabel>{t("language")}</FormLabel>
|
||||
<Select
|
||||
disabled={transcribing}
|
||||
@@ -176,12 +182,12 @@ export const TranscriptionCreateForm = (props: {
|
||||
)}
|
||||
/>
|
||||
<Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
|
||||
<CollapsibleContent>
|
||||
<CollapsibleContent className="space-y-4 mb-4">
|
||||
<FormField
|
||||
control={form.control}
|
||||
name="text"
|
||||
render={({ field }) => (
|
||||
<FormItem className="grid w-full items-center gap-1.5">
|
||||
<FormItem className="grid w-full items-center">
|
||||
<FormLabel>
|
||||
{t("uploadTranscriptFile")}({t("optinal")})
|
||||
</FormLabel>
|
||||
@@ -205,6 +211,9 @@ export const TranscriptionCreateForm = (props: {
|
||||
}
|
||||
}}
|
||||
/>
|
||||
<FormDescription>
|
||||
{t("uploadTranscriptFileDescription")}
|
||||
</FormDescription>
|
||||
{field.value != undefined && (
|
||||
<>
|
||||
<FormLabel>{t("transcript")}</FormLabel>
|
||||
@@ -219,45 +228,92 @@ export const TranscriptionCreateForm = (props: {
|
||||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
<FormField
|
||||
control={form.control}
|
||||
name="isolate"
|
||||
render={({ field }) => (
|
||||
<FormItem className="grid w-full items-center">
|
||||
<FormLabel>{t("isolateVoice")}</FormLabel>
|
||||
<Switch
|
||||
checked={field.value}
|
||||
onCheckedChange={field.onChange}
|
||||
disabled={transcribing}
|
||||
/>
|
||||
<FormDescription>
|
||||
{t("isolateVoiceDescription")}
|
||||
</FormDescription>
|
||||
</FormItem>
|
||||
)}
|
||||
/>
|
||||
</CollapsibleContent>
|
||||
<div className="flex justify-center my-4">
|
||||
<div className="flex justify-center">
|
||||
<CollapsibleTrigger asChild>
|
||||
<Button variant="ghost" size="sm">
|
||||
<span className="">{t("moreOptions")}</span>
|
||||
{collapsibleOpen ? (
|
||||
<ChevronUpIcon className="h-4 w-4" />
|
||||
<>
|
||||
<ChevronUpIcon className="h-4 w-4" />
|
||||
<span className="ml-2">{t("lessOptions")}</span>
|
||||
</>
|
||||
) : (
|
||||
<ChevronDownIcon className="h-4 w-4" />
|
||||
<>
|
||||
<ChevronDownIcon className="h-4 w-4" />
|
||||
<span className="ml-2">{t("moreOptions")}</span>
|
||||
</>
|
||||
)}
|
||||
</Button>
|
||||
</CollapsibleTrigger>
|
||||
</div>
|
||||
</Collapsible>
|
||||
|
||||
{transcribing && form.watch("service") === "local" && (
|
||||
<div className="mb-4">
|
||||
<div className="flex items-center space-x-4 mb-2">
|
||||
<PingPoint colorClassName="bg-yellow-500" />
|
||||
<span>{t("transcribing")}</span>
|
||||
</div>
|
||||
{whisperConfig.service === "local" && (
|
||||
<Progress value={transcribingProgress} />
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
<TranscribeProgress
|
||||
service={form.watch("service")}
|
||||
transcribing={transcribing}
|
||||
transcribingProgress={transcribingProgress}
|
||||
transcribingOutput={transcribingOutput}
|
||||
/>
|
||||
|
||||
<div className="flex justify-end space-x-4">
|
||||
{onCancel && (
|
||||
{onCancel && !transcribing && (
|
||||
<Button type="reset" variant="outline" onClick={onCancel}>
|
||||
{t("cancel")}
|
||||
</Button>
|
||||
)}
|
||||
<Button disabled={transcribing} type="submit" variant="default">
|
||||
{transcribing && <LoaderIcon className="animate-spin w-4 mr-2" />}
|
||||
{t("transcribe")}
|
||||
{t("continue")}
|
||||
</Button>
|
||||
</div>
|
||||
</form>
|
||||
</Form>
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Status panel shown while a transcription is running.
 *
 * Renders nothing unless `transcribing` is true. Otherwise it shows a
 * "transcribing" indicator, a progress bar (only for the "local" service and
 * only once progress is above 0), and the latest output text when there is any.
 */
const TranscribeProgress = ({
  service,
  transcribing,
  transcribingProgress,
  transcribingOutput,
}: {
  service: string;
  transcribing: boolean;
  transcribingProgress: number;
  transcribingOutput?: string;
}) => {
  if (!transcribing) {
    return null;
  }

  // Only the local service reports numeric progress.
  const showProgressBar = service === "local" && transcribingProgress > 0;
  const hasOutput = Boolean(transcribingOutput);

  return (
    <div className="mb-4 space-y-2">
      <div className="flex items-center space-x-4 mb-2">
        <PingPoint colorClassName="bg-yellow-500" />
        <span>{t("transcribing")}</span>
      </div>
      {showProgressBar && <Progress value={transcribingProgress} />}
      {hasOutput && (
        <div className="max-w-full rounded-lg border bg-zinc-950 p-3 dark:bg-zinc-900 h-20 overflow-y-auto">
          <code className="px-[0.3rem] py-[0.2rem] rounded text-muted-foreground font-mono text-xs break-words">
            {transcribingOutput}
          </code>
        </div>
      )}
    </div>
  );
};
|
||||
|
||||
@@ -39,14 +39,12 @@ export const TranscriptionEditButton = (props: {
|
||||
|
||||
const handleSave = async () => {
|
||||
setSubmiting(true);
|
||||
try {
|
||||
await generateTranscription({ originalText: content });
|
||||
setOpen(false);
|
||||
} catch (e) {
|
||||
toast.error(e.message);
|
||||
}
|
||||
|
||||
setSubmiting(false);
|
||||
generateTranscription({ originalText: content })
|
||||
.then(() => setOpen(false))
|
||||
.catch((e) => {
|
||||
toast.error(e.message);
|
||||
})
|
||||
.finally(() => setSubmiting(false));
|
||||
};
|
||||
|
||||
return (
|
||||
|
||||
@@ -69,9 +69,11 @@ type MediaPlayerContextType = {
|
||||
originalText?: string;
|
||||
language?: string;
|
||||
service?: WhisperConfigType["service"];
|
||||
}) => void;
|
||||
isolate?: boolean;
|
||||
}) => Promise<void>;
|
||||
transcribing: boolean;
|
||||
transcribingProgress: number;
|
||||
transcribingOutput: string;
|
||||
transcriptionDraft: TranscriptionType["result"];
|
||||
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
|
||||
// Recordings
|
||||
@@ -172,6 +174,7 @@ export const MediaPlayerProvider = ({
|
||||
generateTranscription,
|
||||
transcribing,
|
||||
transcribingProgress,
|
||||
transcribingOutput,
|
||||
abortGenerateTranscription,
|
||||
} = useTranscriptions(media);
|
||||
|
||||
@@ -611,6 +614,7 @@ export const MediaPlayerProvider = ({
|
||||
generateTranscription,
|
||||
transcribing,
|
||||
transcribingProgress,
|
||||
transcribingOutput,
|
||||
transcriptionDraft,
|
||||
setTranscriptionDraft,
|
||||
isRecording,
|
||||
|
||||
@@ -3,7 +3,7 @@ import {
|
||||
AISettingsProviderContext,
|
||||
} from "@renderer/context";
|
||||
import OpenAI from "openai";
|
||||
import { useContext } from "react";
|
||||
import { useContext, useState } from "react";
|
||||
import { t } from "i18next";
|
||||
import { AI_WORKER_ENDPOINT } from "@/constants";
|
||||
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
|
||||
@@ -15,6 +15,7 @@ export const useTranscribe = () => {
|
||||
const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
|
||||
const { openai } = useContext(AISettingsProviderContext);
|
||||
const { punctuateText } = useAiCommand();
|
||||
const [output, setOutput] = useState<string>("");
|
||||
|
||||
const transcode = async (src: string | Blob): Promise<string> => {
|
||||
if (src instanceof Blob) {
|
||||
@@ -36,6 +37,7 @@ export const useTranscribe = () => {
|
||||
originalText?: string;
|
||||
language: string;
|
||||
service: WhisperConfigType["service"];
|
||||
isolate?: boolean;
|
||||
}
|
||||
): Promise<{
|
||||
engine: string;
|
||||
@@ -45,8 +47,14 @@ export const useTranscribe = () => {
|
||||
tokenId?: number;
|
||||
}> => {
|
||||
const url = await transcode(mediaSrc);
|
||||
const { targetId, targetType, originalText, language, service } =
|
||||
params || {};
|
||||
const {
|
||||
targetId,
|
||||
targetType,
|
||||
originalText,
|
||||
language,
|
||||
service,
|
||||
isolate = false,
|
||||
} = params || {};
|
||||
const blob = await (await fetch(url)).blob();
|
||||
|
||||
let result;
|
||||
@@ -70,6 +78,8 @@ export const useTranscribe = () => {
|
||||
throw new Error(t("whisperServiceNotSupported"));
|
||||
}
|
||||
|
||||
setOutput(null);
|
||||
|
||||
let transcript = originalText || result.text;
|
||||
|
||||
// Remove all content inside `()`, `[]`, `{}` and trim the text
|
||||
@@ -93,6 +103,7 @@ export const useTranscribe = () => {
|
||||
transcript,
|
||||
{
|
||||
language,
|
||||
isolate,
|
||||
}
|
||||
);
|
||||
|
||||
@@ -193,7 +204,8 @@ export const useTranscribe = () => {
|
||||
|
||||
return new Promise((resolve, reject) => {
|
||||
reco.recognizing = (_s, e) => {
|
||||
console.log(e.result.text);
|
||||
console.log(e.result);
|
||||
setOutput(e.result.text);
|
||||
};
|
||||
|
||||
reco.recognized = (_s, e) => {
|
||||
@@ -230,5 +242,6 @@ export const useTranscribe = () => {
|
||||
return {
|
||||
transcode,
|
||||
transcribe,
|
||||
output,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -16,11 +16,17 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
|
||||
);
|
||||
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
|
||||
const [transcription, setTranscription] = useState<TranscriptionType>(null);
|
||||
const { transcribe } = useTranscribe();
|
||||
const { transcribe, output } = useTranscribe();
|
||||
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
|
||||
const [transcribing, setTranscribing] = useState<boolean>(false);
|
||||
const [transcribingOutput, setTranscribingOutput] = useState<string>("");
|
||||
const [service, setService] = useState<WhisperConfigType["service"]>(
|
||||
whisperConfig.service
|
||||
);
|
||||
|
||||
const onTransactionUpdate = (event: CustomEvent) => {
|
||||
if (!transcription) return;
|
||||
|
||||
const { model, action, record } = event.detail || {};
|
||||
if (
|
||||
model === "Transcription" &&
|
||||
@@ -58,12 +64,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
|
||||
originalText?: string;
|
||||
language?: string;
|
||||
service?: WhisperConfigType["service"];
|
||||
isolate?: boolean;
|
||||
}) => {
|
||||
let {
|
||||
originalText,
|
||||
language = learningLanguage,
|
||||
service = whisperConfig.service,
|
||||
isolate = false,
|
||||
} = params || {};
|
||||
setService(service);
|
||||
|
||||
if (originalText === undefined) {
|
||||
if (transcription?.targetId === media.id) {
|
||||
originalText = transcription.result?.originalText;
|
||||
@@ -77,131 +87,135 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
|
||||
|
||||
setTranscribing(true);
|
||||
setTranscribingProgress(0);
|
||||
try {
|
||||
const { engine, model, alignmentResult, tokenId } = await transcribe(
|
||||
media.src,
|
||||
{
|
||||
targetId: media.id,
|
||||
targetType: media.mediaType,
|
||||
originalText,
|
||||
language,
|
||||
service,
|
||||
}
|
||||
);
|
||||
|
||||
let timeline: TimelineEntry[] = [];
|
||||
alignmentResult.timeline.forEach((t) => {
|
||||
if (t.type === "sentence") {
|
||||
timeline.push(t);
|
||||
} else {
|
||||
t.timeline.forEach((st) => {
|
||||
timeline.push(st);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
/*
|
||||
* Pre-process
|
||||
* 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
|
||||
* 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
|
||||
* 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
|
||||
*/
|
||||
try {
|
||||
timeline.forEach((sentence, i) => {
|
||||
const nextSentence = timeline[i + 1];
|
||||
if (
|
||||
!sentence.text
|
||||
.replaceAll(MAGIC_TOKEN_REGEX, "")
|
||||
.match(END_OF_SENTENCE_REGEX) &&
|
||||
nextSentence?.text
|
||||
) {
|
||||
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
|
||||
nextSentence.timeline = [
|
||||
...sentence.timeline,
|
||||
...nextSentence.timeline,
|
||||
];
|
||||
nextSentence.startTime = sentence.startTime;
|
||||
timeline.splice(i, 1);
|
||||
} else {
|
||||
const words = sentence.text.split(" ");
|
||||
|
||||
sentence.timeline.forEach((token, j) => {
|
||||
const word = words[j]?.trim()?.toLowerCase();
|
||||
|
||||
const match = word?.match(/-|%/);
|
||||
if (!match) return;
|
||||
|
||||
if (
|
||||
word === "-" &&
|
||||
token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
|
||||
) {
|
||||
sentence.timeline.splice(j, 0, {
|
||||
type: "token",
|
||||
text: "-",
|
||||
startTime: sentence.timeline[j - 1]?.endTime || 0,
|
||||
endTime: sentence.timeline[j - 1]?.endTime || 0,
|
||||
timeline: [],
|
||||
});
|
||||
return;
|
||||
}
|
||||
|
||||
for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
|
||||
if (word.includes(sentence.timeline[k].text.toLowerCase())) {
|
||||
let connector = "";
|
||||
if (match[0] === "-") {
|
||||
connector = "-";
|
||||
}
|
||||
token.text = [token.text, sentence.timeline[k].text].join(
|
||||
connector
|
||||
);
|
||||
token.timeline = [
|
||||
...token.timeline,
|
||||
...sentence.timeline[k].timeline,
|
||||
];
|
||||
token.endTime = sentence.timeline[k].endTime;
|
||||
sentence.timeline.splice(k, 1);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
} catch (err) {
|
||||
console.error(err);
|
||||
}
|
||||
|
||||
await EnjoyApp.transcriptions.update(transcription.id, {
|
||||
state: "finished",
|
||||
result: {
|
||||
timeline: timeline,
|
||||
transcript: alignmentResult.transcript,
|
||||
originalText,
|
||||
tokenId,
|
||||
},
|
||||
engine,
|
||||
model,
|
||||
const { engine, model, alignmentResult, tokenId } = await transcribe(
|
||||
media.src,
|
||||
{
|
||||
targetId: media.id,
|
||||
targetType: media.mediaType,
|
||||
originalText,
|
||||
language,
|
||||
});
|
||||
|
||||
if (media.language !== language) {
|
||||
if (media.mediaType === "Video") {
|
||||
await EnjoyApp.videos.update(media.id, {
|
||||
language,
|
||||
});
|
||||
} else {
|
||||
await EnjoyApp.audios.update(media.id, {
|
||||
language,
|
||||
});
|
||||
}
|
||||
service,
|
||||
isolate,
|
||||
}
|
||||
);
|
||||
|
||||
let timeline: TimelineEntry[] = [];
|
||||
alignmentResult.timeline.forEach((t) => {
|
||||
if (t.type === "sentence") {
|
||||
timeline.push(t);
|
||||
} else {
|
||||
t.timeline.forEach((st) => {
|
||||
timeline.push(st);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
timeline = preProcessTranscription(timeline);
|
||||
if (media.language !== language) {
|
||||
if (media.mediaType === "Video") {
|
||||
await EnjoyApp.videos.update(media.id, {
|
||||
language,
|
||||
});
|
||||
} else {
|
||||
await EnjoyApp.audios.update(media.id, {
|
||||
language,
|
||||
});
|
||||
}
|
||||
} catch (err) {
|
||||
toast.error(err.message);
|
||||
}
|
||||
|
||||
await EnjoyApp.transcriptions.update(transcription.id, {
|
||||
state: "finished",
|
||||
result: {
|
||||
timeline: timeline,
|
||||
transcript: alignmentResult.transcript,
|
||||
originalText,
|
||||
tokenId,
|
||||
},
|
||||
engine,
|
||||
model,
|
||||
language,
|
||||
});
|
||||
|
||||
setTranscribing(false);
|
||||
};
|
||||
|
||||
/**
 * Pre-process a sentence timeline for display.
 *
 * 1. A sentence whose text (magic tokens removed) does not match
 *    END_OF_SENTENCE_REGEX is merged into the following sentence, e.g. when a
 *    split happened after Mr./Ms./Dr.
 * 2. Words connected by `-` (like scratch-off) that are split into multiple
 *    tokens in the word timeline are merged back into one token.
 * 3. Numbers with `%` that are split into `number + percent` in the word
 *    timeline are merged back into one token.
 *
 * The array and its entries are mutated in place and the same array is
 * returned. If anything throws, the error is logged, a warning toast is
 * shown, and the (possibly partially processed) timeline is returned.
 */
const preProcessTranscription = (timeline: TimelineEntry[]) => {
  try {
    // Index-based loop on purpose: merging removes the current entry, and
    // `forEach` would then skip the merged sentence that shifts into this
    // slot, leaving it without further merging or token processing.
    let i = 0;
    while (i < timeline.length) {
      const sentence = timeline[i];
      const nextSentence = timeline[i + 1];
      const endsSentence = sentence.text
        .replaceAll(MAGIC_TOKEN_REGEX, "")
        .match(END_OF_SENTENCE_REGEX);

      if (!endsSentence && nextSentence?.text) {
        // Fold the unfinished sentence into the next one.
        nextSentence.text = [sentence.text, nextSentence.text].join(" ");
        nextSentence.timeline = [
          ...sentence.timeline,
          ...nextSentence.timeline,
        ];
        nextSentence.startTime = sentence.startTime;
        timeline.splice(i, 1);
        // Do not advance: the merged sentence now sits at index `i` and must
        // be examined itself.
        continue;
      }

      const words = sentence.text.split(" ");

      sentence.timeline.forEach((token, j) => {
        const word = words[j]?.trim()?.toLowerCase();

        const match = word?.match(/-|%/);
        if (!match) return;

        // A standalone "-" word has no token of its own: insert a
        // zero-length "-" token so the tokens line up with the words again.
        if (
          word === "-" &&
          token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
        ) {
          sentence.timeline.splice(j, 0, {
            type: "token",
            text: "-",
            startTime: sentence.timeline[j - 1]?.endTime || 0,
            endTime: sentence.timeline[j - 1]?.endTime || 0,
            timeline: [],
          });
          return;
        }

        // Absorb following tokens that are fragments of this word
        // (e.g. the parts of a hyphenated word, or "percent" after a number).
        for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
          if (word.includes(sentence.timeline[k].text.toLowerCase())) {
            let connector = "";
            if (match[0] === "-") {
              connector = "-";
            }
            token.text = [token.text, sentence.timeline[k].text].join(
              connector
            );
            token.timeline = [
              ...token.timeline,
              ...sentence.timeline[k].timeline,
            ];
            token.endTime = sentence.timeline[k].endTime;
            sentence.timeline.splice(k, 1);
          } else {
            break;
          }
        }
      });

      i++;
    }
  } catch (err) {
    console.warn(err);
    // The thrown value is not guaranteed to be an Error.
    const reason = err instanceof Error ? err.message : String(err);
    toast.warning(
      `Failed to pre-process transcription timeline: ${reason}`
    );
  }
  return timeline;
};
|
||||
|
||||
const findTranscriptionFromWebApi = async () => {
|
||||
if (!transcription) {
|
||||
await findOrCreateTranscription();
|
||||
@@ -252,32 +266,40 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
|
||||
}, [media]);
|
||||
|
||||
/*
|
||||
* auto-generate transcription result
|
||||
* listen to transcription update
|
||||
*/
|
||||
useEffect(() => {
|
||||
if (!transcription) return;
|
||||
|
||||
addDblistener(onTransactionUpdate);
|
||||
return () => {
|
||||
removeDbListener(onTransactionUpdate);
|
||||
};
|
||||
}, [transcription]);
|
||||
|
||||
// if (
|
||||
// transcription.state == "pending" ||
|
||||
// !transcription.result?.["timeline"]
|
||||
// ) {
|
||||
// findOrGenerateTranscription();
|
||||
// }
|
||||
/*
|
||||
* listen to transcribe progress
|
||||
*/
|
||||
useEffect(() => {
|
||||
if (!transcribing) return;
|
||||
|
||||
if (whisperConfig.service === "local") {
|
||||
if (service === "local") {
|
||||
EnjoyApp.whisper.onProgress((_, p: number) => {
|
||||
if (p > 100) p = 100;
|
||||
setTranscribingProgress(p);
|
||||
});
|
||||
}
|
||||
|
||||
EnjoyApp.app.onCmdOutput((_, output) => {
|
||||
setTranscribingOutput(output);
|
||||
});
|
||||
|
||||
return () => {
|
||||
removeDbListener(onTransactionUpdate);
|
||||
EnjoyApp.whisper.removeProgressListeners();
|
||||
EnjoyApp.app.removeCmdOutputListeners();
|
||||
setTranscribingOutput(null);
|
||||
};
|
||||
}, [transcription, media]);
|
||||
}, [media, service, transcribing]);
|
||||
|
||||
const abortGenerateTranscription = () => {
|
||||
EnjoyApp.whisper.abort();
|
||||
@@ -288,6 +310,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
|
||||
transcription,
|
||||
transcribingProgress,
|
||||
transcribing,
|
||||
transcribingOutput: output || transcribingOutput,
|
||||
generateTranscription,
|
||||
abortGenerateTranscription,
|
||||
};
|
||||
|
||||
2
enjoy/src/types/enjoy-app.d.ts
vendored
2
enjoy/src/types/enjoy-app.d.ts
vendored
@@ -10,6 +10,8 @@ type EnjoyAppType = {
|
||||
quit: () => Promise<void>;
|
||||
openDevTools: () => Promise<void>;
|
||||
createIssue: (title: string, body: string) => Promise<void>;
|
||||
onCmdOutput: (callback: (event, output: string) => void) => void;
|
||||
removeCmdOutputListeners: () => void;
|
||||
version: string;
|
||||
};
|
||||
window: {
|
||||
|
||||
Reference in New Issue
Block a user