Improve stt (#764)

* refactor

* add isolate option for alignment

* setup isolate for alignment

* refactor transcription form

* add transcribing output

* update locale

* refactor

* display transcribing output

* cannot cancel when transcribing
This commit is contained in:
an-lee
2024-07-03 11:19:44 +08:00
committed by GitHub
parent d9534bcae8
commit f440947ea0
12 changed files with 315 additions and 172 deletions

View File

@@ -611,8 +611,11 @@
"assessing": "Assessing",
"assessedSuccessfully": "Assessed successfully",
"optinal": "Optional",
"uploadTranscriptFile": "Upload transcript file(.txt/.srt/.vtt)",
"uploadTranscriptFile": "Upload transcript file",
"uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
"onlyTextFileIsSupported": "Only text file is supported",
"isolateVoice": "Isolate voice",
"isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",
"sortBy": "Sort by",
"createdAtDesc": "Created at desc",
"createdAtAsc": "Created at asc",
@@ -627,5 +630,6 @@
"search": "Search",
"noData": "No data",
"selectedFiles": "Selected files",
"moreOptions": "More options"
"moreOptions": "More options",
"lessOptions": "Less options"
}

View File

@@ -611,8 +611,11 @@
"assessing": "正在评估",
"assessedSuccessfully": "评估成功",
"optinal": "可选",
"uploadTranscriptFile": "上传字幕文件(.txt/.srt/.vtt)",
"uploadTranscriptFile": "上传字幕文件",
"uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
"onlyTextFileIsSupported": "仅支持文本文件",
"isolateVoice": "提取人声",
"isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",
"sortBy": "排序",
"createdAtDesc": "创建时间降序",
"createdAtAsc": "创建时间升序",
@@ -627,5 +630,6 @@
"search": "搜索",
"noData": "没有数据",
"selectedFiles": "已选中文件",
"moreOptions": "更多选项"
"moreOptions": "更多选项",
"lessOptions": "更少选项"
}

View File

@@ -431,6 +431,28 @@ ${log}
return { action: "allow" };
});
// Capture stderr & stdout and send them to renderer
const originalStderrWrite = process.stderr.write.bind(process.stderr);
process.stderr.write = (chunk, encoding?, callback?) => {
// Remove ANSI color codes
const output = chunk
.toString()
.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");
mainWindow.webContents.send("app-on-cmd-output", output);
return originalStderrWrite(chunk, encoding, callback);
};
const originalStdoutWrite = process.stdout.write.bind(process.stdout);
process.stdout.write = (chunk, encoding?, callback?) => {
// Remove ANSI color codes
const output = chunk
.toString()
.replace(/\x1B\[([0-9]{1,3}(;[0-9]{1,2};?)?)?[mGK]/g, "");
mainWindow.webContents.send("app-on-cmd-output", output);
return originalStdoutWrite(chunk, encoding, callback);
};
// and load the index.html of the app.
if (MAIN_WINDOW_VITE_DEV_SERVER_URL) {
mainWindow.loadURL(MAIN_WINDOW_VITE_DEV_SERVER_URL);

View File

@@ -2,6 +2,8 @@
// https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
import { version } from "../package.json";
import { callback } from "chart.js/dist/helpers/helpers.core";
import { remove } from "lodash";
contextBridge.exposeInMainWorld("__ENJOY_APP__", {
app: {
@@ -35,6 +37,12 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
createIssue: (title: string, body: string) => {
return ipcRenderer.invoke("app-create-issue", title, body);
},
onCmdOutput: (callback: (event: IpcRendererEvent, data: string) => void) => {
ipcRenderer.on("app-on-cmd-output", callback);
},
removeCmdOutputListeners: () => {
ipcRenderer.removeAllListeners("app-on-cmd-output");
},
version,
},
window: {

View File

@@ -64,7 +64,7 @@ export const MediaCurrentRecording = () => {
currentTime: mediaCurrentTime,
} = useContext(MediaPlayerProviderContext);
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
const { enabled, currentHotkeys } = useContext(
const { currentHotkeys } = useContext(
HotKeysSettingsProviderContext
);
const [player, setPlayer] = useState(null);

View File

@@ -9,6 +9,7 @@ import {
AlertDialogContent,
AlertDialogTitle,
AlertDialogDescription,
toast,
} from "@renderer/components/ui";
import { LoaderIcon } from "lucide-react";
import { TranscriptionCreateForm } from "../transcriptions";
@@ -22,6 +23,7 @@ export const MediaTranscriptionGenerateButton = (props: {
transcribing,
transcription,
transcribingProgress,
transcribingOutput,
} = useContext(MediaPlayerProviderContext);
const [open, setOpen] = useState(false);
@@ -62,11 +64,18 @@ export const MediaTranscriptionGenerateButton = (props: {
originalText: data.text,
language: data.language,
service: data.service as WhisperConfigType["service"],
});
setOpen(false);
isolate: data.isolate,
})
.then(() => {
setOpen(false);
})
.catch((e) => {
toast.error(e.message);
});
}}
transcribing={transcribing}
transcribingProgress={transcribingProgress}
transcribingOutput={transcribingOutput}
/>
</AlertDialogContent>
</AlertDialog>

View File

@@ -12,6 +12,7 @@ import {
CollapsibleContent,
CollapsibleTrigger,
Form,
FormDescription,
FormField,
FormItem,
FormLabel,
@@ -24,6 +25,7 @@ import {
SelectItem,
SelectTrigger,
SelectValue,
Switch,
Textarea,
toast,
} from "@renderer/components/ui";
@@ -36,18 +38,21 @@ const transcriptionSchema = z.object({
language: z.string(),
service: z.string(),
text: z.string().optional(),
isolate: z.boolean().optional(),
});
export const TranscriptionCreateForm = (props: {
onSubmit: (data: z.infer<typeof transcriptionSchema>) => void;
originalText?: string;
onCancel?: () => void;
transcribing?: boolean;
transcribingProgress?: number;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput: string;
}) => {
const {
transcribing = false,
transcribingProgress = 0,
transcribingOutput,
onSubmit,
onCancel,
originalText,
@@ -62,6 +67,7 @@ export const TranscriptionCreateForm = (props: {
language: learningLanguage,
service: whisperConfig.service,
text: originalText,
isolate: false,
},
});
@@ -127,7 +133,7 @@ export const TranscriptionCreateForm = (props: {
control={form.control}
name="service"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>{t("sttAiService")}</FormLabel>
<Select
disabled={transcribing}
@@ -153,7 +159,7 @@ export const TranscriptionCreateForm = (props: {
control={form.control}
name="language"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>{t("language")}</FormLabel>
<Select
disabled={transcribing}
@@ -176,12 +182,12 @@ export const TranscriptionCreateForm = (props: {
)}
/>
<Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
<CollapsibleContent>
<CollapsibleContent className="space-y-4 mb-4">
<FormField
control={form.control}
name="text"
render={({ field }) => (
<FormItem className="grid w-full items-center gap-1.5">
<FormItem className="grid w-full items-center">
<FormLabel>
{t("uploadTranscriptFile")}({t("optinal")})
</FormLabel>
@@ -205,6 +211,9 @@ export const TranscriptionCreateForm = (props: {
}
}}
/>
<FormDescription>
{t("uploadTranscriptFileDescription")}
</FormDescription>
{field.value != undefined && (
<>
<FormLabel>{t("transcript")}</FormLabel>
@@ -219,45 +228,92 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
<FormField
control={form.control}
name="isolate"
render={({ field }) => (
<FormItem className="grid w-full items-center">
<FormLabel>{t("isolateVoice")}</FormLabel>
<Switch
checked={field.value}
onCheckedChange={field.onChange}
disabled={transcribing}
/>
<FormDescription>
{t("isolateVoiceDescription")}
</FormDescription>
</FormItem>
)}
/>
</CollapsibleContent>
<div className="flex justify-center my-4">
<div className="flex justify-center">
<CollapsibleTrigger asChild>
<Button variant="ghost" size="sm">
<span className="">{t("moreOptions")}</span>
{collapsibleOpen ? (
<ChevronUpIcon className="h-4 w-4" />
<>
<ChevronUpIcon className="h-4 w-4" />
<span className="ml-2">{t("lessOptions")}</span>
</>
) : (
<ChevronDownIcon className="h-4 w-4" />
<>
<ChevronDownIcon className="h-4 w-4" />
<span className="ml-2">{t("moreOptions")}</span>
</>
)}
</Button>
</CollapsibleTrigger>
</div>
</Collapsible>
{transcribing && form.watch("service") === "local" && (
<div className="mb-4">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{whisperConfig.service === "local" && (
<Progress value={transcribingProgress} />
)}
</div>
)}
<TranscribeProgress
service={form.watch("service")}
transcribing={transcribing}
transcribingProgress={transcribingProgress}
transcribingOutput={transcribingOutput}
/>
<div className="flex justify-end space-x-4">
{onCancel && (
{onCancel && !transcribing && (
<Button type="reset" variant="outline" onClick={onCancel}>
{t("cancel")}
</Button>
)}
<Button disabled={transcribing} type="submit" variant="default">
{transcribing && <LoaderIcon className="animate-spin w-4 mr-2" />}
{t("transcribe")}
{t("continue")}
</Button>
</div>
</form>
</Form>
);
};
const TranscribeProgress = (props: {
service: string;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput?: string;
}) => {
const { service, transcribing, transcribingProgress, transcribingOutput } =
props;
if (!transcribing) return null;
return (
<div className="mb-4 space-y-2">
<div className="flex items-center space-x-4 mb-2">
<PingPoint colorClassName="bg-yellow-500" />
<span>{t("transcribing")}</span>
</div>
{service === "local" && transcribingProgress > 0 && (
<Progress value={transcribingProgress} />
)}
{transcribingOutput && (
<div className="max-w-full rounded-lg border bg-zinc-950 p-3 dark:bg-zinc-900 h-20 overflow-y-auto">
<code className="px-[0.3rem] py-[0.2rem] rounded text-muted-foreground font-mono text-xs break-words">
{transcribingOutput}
</code>
</div>
)}
</div>
);
};

View File

@@ -39,14 +39,12 @@ export const TranscriptionEditButton = (props: {
const handleSave = async () => {
setSubmiting(true);
try {
await generateTranscription({ originalText: content });
setOpen(false);
} catch (e) {
toast.error(e.message);
}
setSubmiting(false);
generateTranscription({ originalText: content })
.then(() => setOpen(false))
.catch((e) => {
toast.error(e.message);
})
.finally(() => setSubmiting(false));
};
return (

View File

@@ -69,9 +69,11 @@ type MediaPlayerContextType = {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
}) => void;
isolate?: boolean;
}) => Promise<void>;
transcribing: boolean;
transcribingProgress: number;
transcribingOutput: string;
transcriptionDraft: TranscriptionType["result"];
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
// Recordings
@@ -172,6 +174,7 @@ export const MediaPlayerProvider = ({
generateTranscription,
transcribing,
transcribingProgress,
transcribingOutput,
abortGenerateTranscription,
} = useTranscriptions(media);
@@ -611,6 +614,7 @@ export const MediaPlayerProvider = ({
generateTranscription,
transcribing,
transcribingProgress,
transcribingOutput,
transcriptionDraft,
setTranscriptionDraft,
isRecording,

View File

@@ -3,7 +3,7 @@ import {
AISettingsProviderContext,
} from "@renderer/context";
import OpenAI from "openai";
import { useContext } from "react";
import { useContext, useState } from "react";
import { t } from "i18next";
import { AI_WORKER_ENDPOINT } from "@/constants";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
@@ -15,6 +15,7 @@ export const useTranscribe = () => {
const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
const { openai } = useContext(AISettingsProviderContext);
const { punctuateText } = useAiCommand();
const [output, setOutput] = useState<string>("");
const transcode = async (src: string | Blob): Promise<string> => {
if (src instanceof Blob) {
@@ -36,6 +37,7 @@ export const useTranscribe = () => {
originalText?: string;
language: string;
service: WhisperConfigType["service"];
isolate?: boolean;
}
): Promise<{
engine: string;
@@ -45,8 +47,14 @@ export const useTranscribe = () => {
tokenId?: number;
}> => {
const url = await transcode(mediaSrc);
const { targetId, targetType, originalText, language, service } =
params || {};
const {
targetId,
targetType,
originalText,
language,
service,
isolate = false,
} = params || {};
const blob = await (await fetch(url)).blob();
let result;
@@ -70,6 +78,8 @@ export const useTranscribe = () => {
throw new Error(t("whisperServiceNotSupported"));
}
setOutput(null);
let transcript = originalText || result.text;
// Remove all content inside `()`, `[]`, `{}` and trim the text
@@ -93,6 +103,7 @@ export const useTranscribe = () => {
transcript,
{
language,
isolate,
}
);
@@ -193,7 +204,8 @@ export const useTranscribe = () => {
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
console.log(e.result.text);
console.log(e.result);
setOutput(e.result.text);
};
reco.recognized = (_s, e) => {
@@ -230,5 +242,6 @@ export const useTranscribe = () => {
return {
transcode,
transcribe,
output,
};
};

View File

@@ -16,11 +16,17 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
);
const { addDblistener, removeDbListener } = useContext(DbProviderContext);
const [transcription, setTranscription] = useState<TranscriptionType>(null);
const { transcribe } = useTranscribe();
const { transcribe, output } = useTranscribe();
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
const [transcribing, setTranscribing] = useState<boolean>(false);
const [transcribingOutput, setTranscribingOutput] = useState<string>("");
const [service, setService] = useState<WhisperConfigType["service"]>(
whisperConfig.service
);
const onTransactionUpdate = (event: CustomEvent) => {
if (!transcription) return;
const { model, action, record } = event.detail || {};
if (
model === "Transcription" &&
@@ -58,12 +64,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
isolate?: boolean;
}) => {
let {
originalText,
language = learningLanguage,
service = whisperConfig.service,
isolate = false,
} = params || {};
setService(service);
if (originalText === undefined) {
if (transcription?.targetId === media.id) {
originalText = transcription.result?.originalText;
@@ -77,131 +87,135 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
setTranscribing(true);
setTranscribingProgress(0);
try {
const { engine, model, alignmentResult, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
targetType: media.mediaType,
originalText,
language,
service,
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
/*
* Pre-process
* 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
* 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
* 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
*/
try {
timeline.forEach((sentence, i) => {
const nextSentence = timeline[i + 1];
if (
!sentence.text
.replaceAll(MAGIC_TOKEN_REGEX, "")
.match(END_OF_SENTENCE_REGEX) &&
nextSentence?.text
) {
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
nextSentence.timeline = [
...sentence.timeline,
...nextSentence.timeline,
];
nextSentence.startTime = sentence.startTime;
timeline.splice(i, 1);
} else {
const words = sentence.text.split(" ");
sentence.timeline.forEach((token, j) => {
const word = words[j]?.trim()?.toLowerCase();
const match = word?.match(/-|%/);
if (!match) return;
if (
word === "-" &&
token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
) {
sentence.timeline.splice(j, 0, {
type: "token",
text: "-",
startTime: sentence.timeline[j - 1]?.endTime || 0,
endTime: sentence.timeline[j - 1]?.endTime || 0,
timeline: [],
});
return;
}
for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
if (word.includes(sentence.timeline[k].text.toLowerCase())) {
let connector = "";
if (match[0] === "-") {
connector = "-";
}
token.text = [token.text, sentence.timeline[k].text].join(
connector
);
token.timeline = [
...token.timeline,
...sentence.timeline[k].timeline,
];
token.endTime = sentence.timeline[k].endTime;
sentence.timeline.splice(k, 1);
} else {
break;
}
}
});
}
});
} catch (err) {
console.error(err);
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
originalText,
tokenId,
},
engine,
model,
const { engine, model, alignmentResult, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
targetType: media.mediaType,
originalText,
language,
});
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
language,
});
} else {
await EnjoyApp.audios.update(media.id, {
language,
});
}
service,
isolate,
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
timeline = preProcessTranscription(timeline);
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
language,
});
} else {
await EnjoyApp.audios.update(media.id, {
language,
});
}
} catch (err) {
toast.error(err.message);
}
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
originalText,
tokenId,
},
engine,
model,
language,
});
setTranscribing(false);
};
const preProcessTranscription = (timeline: TimelineEntry[]) => {
/*
* Pre-process
* 1. Some words end with period should not be a single sentence, like Mr./Ms./Dr. etc
* 2. Some words connected by `-`(like scrach-off) are split into multiple words in words timeline, merge them for display;
* 3. Some numbers with `%` are split into `number + percent` in words timeline, merge them for display;
*/
try {
timeline.forEach((sentence, i) => {
const nextSentence = timeline[i + 1];
if (
!sentence.text
.replaceAll(MAGIC_TOKEN_REGEX, "")
.match(END_OF_SENTENCE_REGEX) &&
nextSentence?.text
) {
nextSentence.text = [sentence.text, nextSentence.text].join(" ");
nextSentence.timeline = [
...sentence.timeline,
...nextSentence.timeline,
];
nextSentence.startTime = sentence.startTime;
timeline.splice(i, 1);
} else {
const words = sentence.text.split(" ");
sentence.timeline.forEach((token, j) => {
const word = words[j]?.trim()?.toLowerCase();
const match = word?.match(/-|%/);
if (!match) return;
if (
word === "-" &&
token.text.toLowerCase() === words[j + 1]?.trim()?.toLowerCase()
) {
sentence.timeline.splice(j, 0, {
type: "token",
text: "-",
startTime: sentence.timeline[j - 1]?.endTime || 0,
endTime: sentence.timeline[j - 1]?.endTime || 0,
timeline: [],
});
return;
}
for (let k = j + 1; k <= sentence.timeline.length - 1; k++) {
if (word.includes(sentence.timeline[k].text.toLowerCase())) {
let connector = "";
if (match[0] === "-") {
connector = "-";
}
token.text = [token.text, sentence.timeline[k].text].join(
connector
);
token.timeline = [
...token.timeline,
...sentence.timeline[k].timeline,
];
token.endTime = sentence.timeline[k].endTime;
sentence.timeline.splice(k, 1);
} else {
break;
}
}
});
}
});
} catch (err) {
console.warn(err);
toast.warning(
`Failed to pre-process transcription timeline: ${err.message}`
);
}
return timeline;
};
const findTranscriptionFromWebApi = async () => {
if (!transcription) {
await findOrCreateTranscription();
@@ -252,32 +266,40 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}, [media]);
/*
* auto-generate transcription result
* listen to transcription update
*/
useEffect(() => {
if (!transcription) return;
addDblistener(onTransactionUpdate);
return () => {
removeDbListener(onTransactionUpdate);
};
}, [transcription]);
// if (
// transcription.state == "pending" ||
// !transcription.result?.["timeline"]
// ) {
// findOrGenerateTranscription();
// }
/*
* listen to transcribe progress
*/
useEffect(() => {
if (!transcribing) return;
if (whisperConfig.service === "local") {
if (service === "local") {
EnjoyApp.whisper.onProgress((_, p: number) => {
if (p > 100) p = 100;
setTranscribingProgress(p);
});
}
EnjoyApp.app.onCmdOutput((_, output) => {
setTranscribingOutput(output);
});
return () => {
removeDbListener(onTransactionUpdate);
EnjoyApp.whisper.removeProgressListeners();
EnjoyApp.app.removeCmdOutputListeners();
setTranscribingOutput(null);
};
}, [transcription, media]);
}, [media, service, transcribing]);
const abortGenerateTranscription = () => {
EnjoyApp.whisper.abort();
@@ -288,6 +310,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
transcription,
transcribingProgress,
transcribing,
transcribingOutput: output || transcribingOutput,
generateTranscription,
abortGenerateTranscription,
};

View File

@@ -10,6 +10,8 @@ type EnjoyAppType = {
quit: () => Promise<void>;
openDevTools: () => Promise<void>;
createIssue: (title: string, body: string) => Promise<void>;
onCmdOutput: (callback: (event, output: string) => void) => void;
removeCmdOutputListeners: () => void;
version: string;
};
window: {