Feat: Improve alignment for audio with background noise (#870)

* use echogarden.alignSegments

* fix cloudflare whisper

* refactor azure ai transcribe

* refactor

* fix align result

* refactor

* edit transcription in srt format

* improve timeline

* refactor

* fix updating the current segment index

* validate text when using uploaded transcript

* add form description

* refactor codes

* do not change sentence timeline based on word timeline
an-lee
2024-07-23 15:24:24 +08:00
committed by GitHub
parent 57403cdf47
commit 078f5159ff
22 changed files with 458 additions and 134 deletions

View File

@@ -242,6 +242,7 @@
"downloadFfmpeg": "Download FFmpeg",
"youAreReadyToGo": "You are ready to go",
"welcomeBack": "Welcome back! {{name}}",
"print": "Print",
"download": "Download",
"downloading": "Downloading {{file}}",
"downloadedSuccessfully": "Downloaded successfully",
@@ -374,6 +375,7 @@
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
"openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
"uploadSpeechToTextDescription": "Upload transcript file or input transcript text to align.",
"checkingWhisper": "Checking whisper status",
"pleaseDownloadWhisperModelFirst": "Please download whisper model first",
"whisperIsWorkingGood": "Whisper is working good",
@@ -618,7 +620,8 @@
"assessedSuccessfully": "Assessed successfully",
"optinal": "Optional",
"uploadTranscriptFile": "Upload transcript file",
"uploadTranscriptFileDescription": "Optional. Support formats: txt/srt/vtt.",
"uploadTranscriptFileDescription": "Support formats: txt/srt/vtt.",
"pleaseUploadTranscriptFile": "Please upload transcript file",
"onlyTextFileIsSupported": "Only text file is supported",
"isolateVoice": "Isolate voice(Experimental)",
"isolateVoiceDescription": "Isolates voice from any music or background ambience. More accurate but slower",

View File

@@ -242,6 +242,7 @@
"downloadFfmpeg": "下载 FFmpeg",
"youAreReadyToGo": "您已准备就绪",
"welcomeBack": "欢迎回来, {{name}}",
"print": "打印",
"download": "下载",
"downloading": "正在下载 {{file}}",
"downloadedSuccessfully": "下载成功",
@@ -374,6 +375,7 @@
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
"openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
"uploadSpeechToTextDescription": "上传字幕文件或者输入文本进行字幕对齐",
"checkingWhisper": "正在检查 Whisper",
"pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
"whisperIsWorkingGood": "Whisper 正常工作",
@@ -618,7 +620,8 @@
"assessedSuccessfully": "评估成功",
"optinal": "可选",
"uploadTranscriptFile": "上传字幕文件",
"uploadTranscriptFileDescription": "可选。支持字幕文件格式: txt/srt/vtt。",
"uploadTranscriptFileDescription": "支持字幕文件格式: txt/srt/vtt。",
"pleaseUploadTranscriptFile": "请上传字幕文件",
"onlyTextFileIsSupported": "仅支持文本文件",
"isolateVoice": "提取人声(实验性)",
"isolateVoiceDescription": "将人声从音乐、背景音中隔离,字幕对齐会更准确,但耗时较久。",

View File

@@ -1,7 +1,6 @@
import { ipcMain } from "electron";
import * as Echogarden from "echogarden/dist/api/API.js";
import { AlignmentOptions } from "echogarden/dist/api/API";
import { AudioSourceParam } from "echogarden/dist/audio/AudioUtilities";
import {
encodeRawAudioToWave,
decodeWaveToRawAudio,
@@ -9,7 +8,9 @@ import {
getRawAudioDuration,
trimAudioStart,
trimAudioEnd,
AudioSourceParam,
} from "echogarden/dist/audio/AudioUtilities.js";
import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
import path from "path";
import log from "@main/logger";
import url from "url";
@@ -34,6 +35,7 @@ const __dirname = path
const logger = log.scope("echogarden");
class EchogardenWrapper {
public align: typeof Echogarden.align;
public alignSegments: typeof Echogarden.alignSegments;
public denoise: typeof Echogarden.denoise;
public encodeRawAudioToWave: typeof encodeRawAudioToWave;
public decodeWaveToRawAudio: typeof decodeWaveToRawAudio;
@@ -44,6 +46,7 @@ class EchogardenWrapper {
constructor() {
this.align = Echogarden.align;
this.alignSegments = Echogarden.alignSegments;
this.denoise = Echogarden.denoise;
this.encodeRawAudioToWave = encodeRawAudioToWave;
this.decodeWaveToRawAudio = decodeWaveToRawAudio;
@@ -110,6 +113,25 @@ class EchogardenWrapper {
}
);
ipcMain.handle(
"echogarden-align-segments",
async (
_event,
input: AudioSourceParam,
timeline: Timeline,
options: AlignmentOptions
) => {
logger.debug("echogarden-align-segments:", timeline, options);
try {
const rawAudio = await this.ensureRawAudio(input, 16000);
return await this.alignSegments(rawAudio, timeline, options);
} catch (err) {
logger.error(err);
throw err;
}
}
);
ipcMain.handle(
"echogarden-transcode",
async (_event, url: string, sampleRate?: number) => {

View File

@@ -94,7 +94,7 @@ const userDataPath = () => {
const apiUrl = () => {
const url: string = settings.getSync("apiUrl") as string;
return process.env.API_URL || url || WEB_API_URL;
return process.env.WEB_API_URL || url || WEB_API_URL;
};
export default {

View File

@@ -105,6 +105,8 @@ class Whipser {
`--model "${model.savePath}"`,
"--output-json",
`--output-file "${path.join(tmpDir, "jfk")}"`,
`--split-on-word true`,
`--max-len 1`,
];
logger.debug(`Checking whisper command: ${commands.join(" ")}`);
exec(
@@ -203,6 +205,9 @@ class Whipser {
"--print-progress",
"--language",
model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
`--split-on-word`,
`--max-len`,
"1",
...extra,
];
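
The new `--split-on-word` / `--max-len 1` flags ask whisper.cpp to cut its JSON output at word boundaries, so each transcription segment carries roughly one word with its own offsets. A rough sketch of the shape the renderer consumes (field names taken from the `transcribeByLocal` mapping later in this commit; the sample values are invented):

```typescript
// Rough shape of the per-word segments whisper.cpp emits with
// `--split-on-word --max-len 1`, and how transcribeByLocal (later in this
// commit) turns them into word timeline entries. Sample values are invented.
interface WhisperWordSegment {
  text: string;
  offsets: { from: number; to: number }; // milliseconds
}

const segments: WhisperWordSegment[] = [
  { text: " And", offsets: { from: 0, to: 320 } },
  { text: " so", offsets: { from: 320, to: 540 } },
  { text: " my", offsets: { from: 540, to: 760 } },
];

const wordTimeline = segments.map((word) => ({
  type: "word" as const,
  text: word.text,
  startTime: word.offsets.from / 1000.0,
  endTime: word.offsets.to / 1000.0,
}));
```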

View File

@@ -2,6 +2,7 @@
// https://www.electronjs.org/docs/latest/tutorial/process-model#preload-scripts
import { contextBridge, ipcRenderer, IpcRendererEvent } from "electron";
import { version } from "../package.json";
import { Timeline } from "echogarden/dist/utilities/Timeline";
contextBridge.exposeInMainWorld("__ENJOY_APP__", {
app: {
@@ -439,6 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
align: (input: string, transcript: string, options: any) => {
return ipcRenderer.invoke("echogarden-align", input, transcript, options);
},
alignSegments: (input: string, timeline: Timeline, options: any) => {
return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
},
transcode: (input: string) => {
return ipcRenderer.invoke("echogarden-transcode", input);
},
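
With the bridge exposed, the renderer can hand Echogarden a sentence-level timeline and get back word-level alignments. A minimal sketch of the call, assuming the object exposed as `__ENJOY_APP__` is available as `EnjoyApp`; the audio blob, sentence entry, and options below are placeholders:

```typescript
// Minimal sketch of calling the new bridge from the renderer process.
// `EnjoyApp` stands in for the object exposed as window.__ENJOY_APP__;
// the blob, sentence entry, and options are placeholders.
async function alignSentences(EnjoyApp: any, audioBlob: Blob) {
  const sentenceTimeline = [
    {
      type: "sentence",
      text: "Hello world.",
      startTime: 0,
      endTime: 1.2,
      timeline: [],
    },
  ];

  // Resolves to a word-level Timeline aligned within each sentence segment.
  return EnjoyApp.echogarden.alignSegments(
    new Uint8Array(await audioBlob.arrayBuffer()),
    sentenceTimeline,
    { language: "en", isolate: false }
  );
}
```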

View File

@@ -12,4 +12,4 @@ export * from "./media-provider";
export * from "./media-tabs";
export * from "./media-loading-modal";
export * from "./add-media-button";
export * from "./media-transcription-download";
export * from "./media-transcription-print";

View File

@@ -246,7 +246,6 @@ export const MediaCaption = () => {
if (index < 0) return;
if (index !== activeIndex) {
console.log("setActiveIndex", index);
setActiveIndex(index);
}
}, [currentTime, caption]);
@@ -509,8 +508,8 @@ export const Caption = (props: {
let words = caption.text.split(" ");
const ipas = caption.timeline.map((w) =>
w.timeline.map((t) =>
language.startsWith("en")
w.timeline?.map((t) =>
t.timeline && language.startsWith("en")
? convertWordIpaToNormal(
t.timeline.map((s) => s.text),
{ mappings: ipaMappings }

View File

@@ -5,7 +5,7 @@ import {
} from "@renderer/context";
import { TabsContent, Separator } from "@renderer/components/ui";
import { t } from "i18next";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { convertWordIpaToNormal } from "@/utils";
import {
CamdictLookupResult,
@@ -41,7 +41,9 @@ const SelectedWords = (props: {
const { selectedIndices, caption } = props;
const { transcription } = useContext(MediaPlayerProviderContext);
const { learningLanguage, ipaMappings } = useContext(AppSettingsProviderContext);
const { learningLanguage, ipaMappings } = useContext(
AppSettingsProviderContext
);
const word = selectedIndices
.map((index) => caption.timeline[index]?.text || "")

View File

@@ -34,7 +34,7 @@ import { useHotkeys } from "react-hotkeys-hook";
import cloneDeep from "lodash/cloneDeep";
import debounce from "lodash/debounce";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
const PLAYBACK_RATE_OPTIONS = [0.75, 0.8, 0.9, 1.0];
export const MediaPlayerControls = () => {
@@ -57,7 +57,7 @@ export const MediaPlayerControls = () => {
setTranscriptionDraft,
} = useContext(MediaPlayerProviderContext);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
const { currentHotkeys, enabled } = useContext(
const { currentHotkeys } = useContext(
HotKeysSettingsProviderContext
);
const [playMode, setPlayMode] = useState<"loop" | "single" | "all">("single");

View File

@@ -76,7 +76,9 @@ export const MediaTranscriptionGenerateButton = (props: {
generateTranscription({
originalText: data.text,
language: data.language,
service: data.service as WhisperConfigType["service"],
service: data.service as
| WhisperConfigType["service"]
| "upload",
isolate: data.isolate,
})
.then(() => {

View File

@@ -9,7 +9,7 @@ import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { convertWordIpaToNormal } from "@/utils";
import template from "./transcription.template.html?raw";
export const MediaTranscriptionDownload = () => {
export const MediaTranscriptionPrint = () => {
const { media, transcription } = useContext(MediaPlayerProviderContext);
const { EnjoyApp, learningLanguage, ipaMappings } = useContext(
AppSettingsProviderContext
@@ -59,7 +59,7 @@ export const MediaTranscriptionDownload = () => {
async function download() {
try {
const savePath = await EnjoyApp.dialog.showSaveDialog({
title: t("download"),
title: t("print"),
defaultPath: `${media.name}.pdf`,
});
@@ -75,7 +75,7 @@ export const MediaTranscriptionDownload = () => {
return (
<Button variant="ghost" className="block w-full" onClick={download}>
{t("download")}
{t("print")}
</Button>
);
};

View File

@@ -28,7 +28,7 @@ import {
SheetHeader,
toast,
} from "@renderer/components/ui";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { t } from "i18next";
import WaveSurfer from "wavesurfer.js";
import {

View File

@@ -26,7 +26,7 @@ import { formatDuration } from "@renderer/lib/utils";
import {
MediaTranscriptionReadButton,
MediaTranscriptionGenerateButton,
MediaTranscriptionDownload,
MediaTranscriptionPrint,
TranscriptionEditButton,
} from "@renderer/components";
@@ -165,7 +165,7 @@ export const MediaTranscription = (props: { display?: boolean }) => {
</TranscriptionEditButton>
</DropdownMenuItem>
<DropdownMenuItem asChild>
<MediaTranscriptionDownload />
<MediaTranscriptionPrint />
</DropdownMenuItem>
</DropdownMenuContent>
</DropdownMenu>

View File

@@ -1,4 +1,4 @@
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { useContext, useState } from "react";
import { WavesurferPlayer } from "@/renderer/components/misc";
import { AppSettingsProviderContext } from "@/renderer/context";

View File

@@ -3,14 +3,11 @@ import {
AppSettingsProviderContext,
} from "@renderer/context";
import { zodResolver } from "@hookform/resolvers/zod";
import { useContext, useState } from "react";
import { useContext } from "react";
import { useForm } from "react-hook-form";
import { z } from "zod";
import {
Button,
Collapsible,
CollapsibleContent,
CollapsibleTrigger,
Form,
FormDescription,
FormField,
@@ -31,8 +28,9 @@ import {
} from "@renderer/components/ui";
import { t } from "i18next";
import { LANGUAGES } from "@/constants";
import { ChevronDownIcon, ChevronUpIcon, LoaderIcon } from "lucide-react";
import { LoaderIcon } from "lucide-react";
import { parseText } from "media-captions";
import { milisecondsToTimestamp } from "@/utils";
const transcriptionSchema = z.object({
language: z.string(),
@@ -59,18 +57,28 @@ export const TranscriptionCreateForm = (props: {
} = props;
const { learningLanguage } = useContext(AppSettingsProviderContext);
const { whisperConfig } = useContext(AISettingsProviderContext);
const [collapsibleOpen, setCollapsibleOpen] = useState(false);
const form = useForm<z.infer<typeof transcriptionSchema>>({
resolver: zodResolver(transcriptionSchema),
values: {
language: learningLanguage,
service: whisperConfig.service,
service: originalText ? "upload" : whisperConfig.service,
text: originalText,
isolate: false,
},
});
const handleSubmit = (data: z.infer<typeof transcriptionSchema>) => {
const { service, text } = data;
if (service === "upload" && !text) {
toast.error(t("pleaseUploadTranscriptFile"));
return;
}
onSubmit(data);
};
const parseSubtitle = (file: File) => {
const fileType = file.name.split(".").pop();
return new Promise<string>((resolve, reject) => {
@@ -88,7 +96,16 @@ export const TranscriptionCreateForm = (props: {
if (caption.cues.length === 0) {
text = cleanSubtitleText(text as string);
} else {
text = caption.cues.map((cue) => cue.text).join("\n");
// Write cues to text in SRT format
text = caption.cues
.map((cue, _) => {
return `${milisecondsToTimestamp(
cue.startTime * 1000
)} --> ${milisecondsToTimestamp(cue.endTime * 1000)}\n${
cue.text
}`;
})
.join("\n\n");
}
if (text.length === 0) {
@@ -126,7 +143,7 @@ export const TranscriptionCreateForm = (props: {
return (
<Form {...form}>
<form
onSubmit={form.handleSubmit(onSubmit)}
onSubmit={form.handleSubmit(handleSubmit)}
className="gap-4 grid w-full"
>
<FormField
@@ -150,8 +167,21 @@ export const TranscriptionCreateForm = (props: {
{t("cloudflareAi")}
</SelectItem>
<SelectItem value="openai">OpenAI</SelectItem>
<SelectItem value="upload">{t("upload")}</SelectItem>
</SelectContent>
</Select>
<FormDescription>
{form.watch("service") === "local" &&
t("localSpeechToTextDescription")}
{form.watch("service") === "azure" &&
t("azureSpeechToTextDescription")}
{form.watch("service") === "cloudflare" &&
t("cloudflareSpeechToTextDescription")}
{form.watch("service") === "openai" &&
t("openaiSpeechToTextDescription")}
{form.watch("service") === "upload" &&
t("uploadSpeechToTextDescription")}
</FormDescription>
</FormItem>
)}
/>
@@ -181,16 +211,14 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
<Collapsible open={collapsibleOpen} onOpenChange={setCollapsibleOpen}>
<CollapsibleContent className="space-y-4 mb-4">
{form.watch("service") === "upload" && (
<>
<FormField
control={form.control}
name="text"
render={({ field }) => (
<FormItem className="grid w-full items-center">
<FormLabel>
{t("uploadTranscriptFile")}({t("optinal")})
</FormLabel>
<FormLabel>{t("uploadTranscriptFile")}</FormLabel>
<Input
disabled={transcribing}
type="file"
@@ -245,25 +273,8 @@ export const TranscriptionCreateForm = (props: {
</FormItem>
)}
/>
</CollapsibleContent>
<div className="flex justify-center">
<CollapsibleTrigger asChild>
<Button variant="ghost" size="sm">
{collapsibleOpen ? (
<>
<ChevronUpIcon className="h-4 w-4" />
<span className="ml-2">{t("lessOptions")}</span>
</>
) : (
<>
<ChevronDownIcon className="h-4 w-4" />
<span className="ml-2">{t("moreOptions")}</span>
</>
)}
</Button>
</CollapsibleTrigger>
</div>
</Collapsible>
</>
)}
<TranscribeProgress
service={form.watch("service")}
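
When the uploaded subtitle parses into cues, the form now re-serializes them as SRT blocks (timestamp, arrow, text) so the original sentence boundaries survive the alignment step. A self-contained sketch of the output for two cues — `toTimestamp` here is a stand-in for the project's `milisecondsToTimestamp` helper and assumes an `HH:MM:SS,mmm` format:

```typescript
// Self-contained sketch of the SRT-style text built from parsed cues.
// `toTimestamp` is a stand-in for milisecondsToTimestamp; cue values are invented.
const toTimestamp = (ms: number) => {
  const pad = (n: number, width = 2) =>
    Math.floor(n).toString().padStart(width, "0");
  return `${pad(ms / 3600000)}:${pad((ms % 3600000) / 60000)}:${pad(
    (ms % 60000) / 1000
  )},${pad(ms % 1000, 3)}`;
};

const cues = [
  { startTime: 0, endTime: 2.5, text: "Hello there." },
  { startTime: 2.5, endTime: 5.0, text: "How are you?" },
];

const text = cues
  .map(
    (cue) =>
      `${toTimestamp(cue.startTime * 1000)} --> ${toTimestamp(
        cue.endTime * 1000
      )}\n${cue.text}`
  )
  .join("\n\n");

// text:
// 00:00:00,000 --> 00:00:02,500
// Hello there.
//
// 00:00:02,500 --> 00:00:05,000
// How are you?
```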

View File

@@ -20,26 +20,36 @@ import {
Textarea,
toast,
} from "@renderer/components/ui";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import { t } from "i18next";
import { useContext, useState } from "react";
import { useContext, useEffect, useState } from "react";
import { LoaderIcon } from "lucide-react";
import { milisecondsToTimestamp } from "@/utils";
export const TranscriptionEditButton = (props: {
children?: React.ReactNode;
}) => {
const [open, setOpen] = useState(false);
const [submiting, setSubmiting] = useState(false);
const { transcription, generateTranscription } = useContext(
const { media, transcription, generateTranscription } = useContext(
MediaPlayerProviderContext
);
const [open, setOpen] = useState(false);
const [submiting, setSubmiting] = useState(false);
const [content, setContent] = useState<string>(
transcription.result.timeline.map((t: TimelineEntry) => t.text).join("\n\n")
// generate text in SRT format from timeline entries
transcription.result.timeline
.map(
(t: TimelineEntry) =>
`${milisecondsToTimestamp(
t.startTime * 1000
)} --> ${milisecondsToTimestamp(t.endTime * 1000)}\n${t.text}`
)
.join("\n\n")
);
const [downloadUrl, setDownloadUrl] = useState<string>();
const handleSave = async () => {
setSubmiting(true);
generateTranscription({ originalText: content })
generateTranscription({ originalText: content, service: "upload" })
.then(() => setOpen(false))
.catch((e) => {
toast.error(e.message);
@@ -47,6 +57,13 @@ export const TranscriptionEditButton = (props: {
.finally(() => setSubmiting(false));
};
useEffect(() => {
if (!content) return;
const blob = new Blob([content], { type: "text/html" });
setDownloadUrl(URL.createObjectURL(blob));
}, [content]);
return (
<Dialog open={open} onOpenChange={setOpen}>
<DialogTrigger asChild>
@@ -76,6 +93,11 @@ export const TranscriptionEditButton = (props: {
{t("cancel")}
</Button>
</DialogClose>
<DialogClose asChild>
<a download={`${media.name}.srt`} href={downloadUrl}>
<Button variant="secondary">{t("download")}</Button>
</a>
</DialogClose>
<AlertDialog>
<AlertDialogTrigger asChild>

View File

@@ -68,7 +68,7 @@ type MediaPlayerContextType = {
generateTranscription: (params?: {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
service?: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}) => Promise<void>;
transcribing: boolean;
@@ -352,7 +352,7 @@ export const MediaPlayerProvider = ({
let phones: TimelineEntry[] = [];
words.forEach((word: TimelineEntry) => {
word.timeline.forEach((token: TimelineEntry) => {
word.timeline?.forEach((token: TimelineEntry) => {
phones = phones.concat(token.timeline);
});
});

View File

@@ -8,9 +8,92 @@ import { t } from "i18next";
import { AI_WORKER_ENDPOINT } from "@/constants";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import axios from "axios";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
import { useAiCommand } from "./use-ai-command";
import { toast } from "@renderer/components/ui";
import {
Timeline,
TimelineEntry,
type TimelineEntryType,
} from "echogarden/dist/utilities/Timeline";
import take from "lodash/take";
import sortedUniqBy from "lodash/sortedUniqBy";
import { parseText } from "media-captions";
/*
* define the regex pattern to match the end of a sentence
* the end of a sentence is defined as a period, question mark, or exclamation mark
* also it may be followed by a quotation mark
* and exclude special cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc."
*/
const sentenceEndPattern = /(?<!Mr|Mrs|Dr|Ms|etc)\.|\?|!\"?/;
// test whether a text string contains any punctuation;
// some transcribed text may not have any punctuation
const punctuationsPattern = /\w[.,!?](\s|$)/g;
/*
* convert the word timeline to sentence timeline
* a sentence is a group of words that ends with a punctuation
*/
const wordTimelineToSentenceTimeline = (
wordTimeline: TimelineEntry[]
): TimelineEntry[] => {
const timeline: TimelineEntry[] = [];
wordTimeline.forEach((word, index) => {
word.text = word.text.trim();
// skip empty words
if (!word.text) return;
// skip music or sound effects quoted in []
if (word.text.match(/^\[.*\]$/)) return;
const wordEntry = {
type: "word" as TimelineEntryType,
text: word.text,
startTime: word.startTime,
endTime: word.endTime,
};
let sentence: TimelineEntry;
// get the last sentence in the timeline
if (timeline.length > 0) {
sentence = timeline[timeline.length - 1];
}
// if there is no sentence in the timeline, create a new sentence
// if last sentence is a punctuation, create a new sentence
if (!sentence || sentence.text.match(sentenceEndPattern)) {
sentence = {
type: "sentence" as TimelineEntryType,
text: "",
startTime: wordEntry.startTime,
endTime: wordEntry.endTime,
timeline: [],
};
timeline.push(sentence);
}
// if the word is a punctuation, add it to the sentence and start a new sentence
if (wordEntry.text.match(sentenceEndPattern)) {
sentence.text += wordEntry.text;
sentence.endTime = wordEntry.endTime;
const lastSentence = timeline[timeline.length - 1];
if (lastSentence.endTime !== sentence.endTime) {
timeline.push(sentence);
}
} else {
sentence.text += wordEntry.text + " ";
sentence.endTime = wordEntry.endTime;
if (index === wordTimeline.length - 1) {
timeline.push(sentence);
}
}
});
return timeline;
};
export const useTranscribe = () => {
const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
@@ -37,13 +120,14 @@ export const useTranscribe = () => {
targetType?: string;
originalText?: string;
language: string;
service: WhisperConfigType["service"];
service: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}
): Promise<{
engine: string;
model: string;
alignmentResult: AlignmentResult;
transcript: string;
timeline: TimelineEntry[];
originalText?: string;
tokenId?: number;
}> => {
@@ -58,67 +142,152 @@ export const useTranscribe = () => {
} = params || {};
const blob = await (await fetch(url)).blob();
let result;
if (originalText) {
result = {
engine: "original",
model: "original",
};
let result: any;
let timeline: Timeline = [];
if (service === "upload" && originalText) {
const caption = await parseText(originalText, { type: "srt" });
if (caption.cues.length > 0) {
timeline = caption.cues.map((cue) => {
return {
type: "sentence",
text: cue.text,
startTime: cue.startTime,
endTime: cue.endTime,
timeline: [],
};
});
result = {
engine: "upload",
model: "-",
text: timeline.map((entry) => entry.text).join(" "),
timeline,
};
} else {
result = {
engine: "upload",
model: "-",
text: originalText,
};
}
} else if (service === "local") {
result = await transcribeByLocal(url, language);
} else if (service === "cloudflare") {
result = await transcribeByCloudflareAi(blob);
} else if (service === "openai") {
result = await transcribeByOpenAi(blob);
result = await transcribeByOpenAi(
new File([blob], "audio.mp3", { type: "audio/mp3" })
);
} else if (service === "azure") {
result = await transcribeByAzureAi(blob, language, {
targetId,
targetType,
});
result = await transcribeByAzureAi(
new File([blob], "audio.wav", { type: "audio/wav" }),
language,
{
targetId,
targetType,
}
);
} else {
throw new Error(t("whisperServiceNotSupported"));
}
let transcript = result.text;
setOutput(null);
/*
* if timeline is available and the transcript contains punctuations
* use `alignSegments` to align each sentence with the timeline
* otherwise, use `align` to align the whole transcript
* if the transcript does not contain any punctuation, use AI command to add punctuation
*/
if (result.timeline?.length && transcript.match(punctuationsPattern)) {
timeline = [...result.timeline];
setOutput("Aligning the transcript...");
const wordTimeline = await EnjoyApp.echogarden.alignSegments(
new Uint8Array(await blob.arrayBuffer()),
timeline,
{
language,
isolate,
}
);
let transcript = originalText || result.text;
wordTimeline.forEach((word: TimelineEntry) => {
let sentence = timeline.find(
(entry) =>
word.startTime >= entry.startTime && word.endTime <= entry.endTime
);
// Remove all content inside `()`, `[]`, `{}` and trim the text
// remove all markdown formatting
transcript = transcript
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.replace(/\{.*?\}/g, "")
.replace(/[*_`]/g, "")
.trim();
if (sentence) {
sentence.timeline.push(word);
}
});
// if the transcript does not contain any punctuation, use AI command to add punctuation
if (!transcript.match(/\w[.,!?](\s|$)/)) {
try {
transcript = await punctuateText(transcript);
} catch (err) {
toast.error(err.message);
console.warn(err.message);
/*
* the start time of a sentence should be the start time of the first word in the sentence
* the end time of a sentence should be the end time of the last word in the sentence
*/
// timeline.forEach((t) => {
// if (t.timeline.length === 0) return;
// t.startTime = t.timeline[0].startTime;
// t.endTime = t.timeline[t.timeline.length - 1].endTime;
// });
} else {
// Remove all content inside `()`, `[]`, `{}` and trim the text
// remove all markdown formatting
transcript = transcript
.replace(/\(.*?\)/g, "")
.replace(/\[.*?\]/g, "")
.replace(/\{.*?\}/g, "")
.replace(/[*_`]/g, "")
.trim();
// if the transcript does not contain any punctuation, use AI command to add punctuation
if (!transcript.match(punctuationsPattern)) {
try {
transcript = await punctuateText(transcript);
} catch (err) {
toast.error(err.message);
console.warn(err.message);
}
}
setOutput("Aligning the transcript...");
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
transcript,
{
language,
isolate,
}
);
alignmentResult.timeline.forEach((t: TimelineEntry) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
}
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
transcript,
{
language,
isolate,
}
);
return {
...result,
originalText,
alignmentResult,
transcript,
timeline,
};
};
const transcribeByLocal = async (url: string, language?: string) => {
const transcribeByLocal = async (
url: string,
language?: string
): Promise<{
engine: string;
model: string;
text: string;
timeline: TimelineEntry[];
}> => {
const res = await EnjoyApp.whisper.transcribe(
{
file: url,
@@ -130,14 +299,25 @@ export const useTranscribe = () => {
}
);
const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.text,
startTime: word.offsets.from / 1000.0,
endTime: word.offsets.to / 1000.0,
};
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
return {
engine: "whisper",
model: res.model.type,
text: res.transcription.map((segment) => segment.text).join(" "),
timeline,
};
};
const transcribeByOpenAi = async (blob: Blob) => {
const transcribeByOpenAi = async (file: File) => {
if (!openai?.key) {
throw new Error(t("openaiKeyRequired"));
}
@@ -149,20 +329,58 @@ export const useTranscribe = () => {
maxRetries: 0,
});
const res: { text: string } = (await client.audio.transcriptions.create({
file: new File([blob], "audio.wav"),
const res: {
text: string;
words?: { word: string; start: number; end: number }[];
segments?: { text: string; start: number; end: number }[];
} = (await client.audio.transcriptions.create({
file,
model: "whisper-1",
response_format: "json",
response_format: "verbose_json",
timestamp_granularities: ["word"],
})) as any;
let timeline: TimelineEntry[] = [];
if (res.segments) {
res.segments.forEach((segment) => {
const segmentTimeline = {
type: "sentence" as TimelineEntryType,
text: segment.text,
startTime: segment.start,
endTime: segment.end,
timeline: [] as Timeline,
};
timeline.push(segmentTimeline);
});
} else if (res.words) {
const wordTimeline = res.words.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.word,
startTime: word.start,
endTime: word.end,
};
});
timeline = wordTimelineToSentenceTimeline(wordTimeline);
}
return {
engine: "openai",
model: "whisper-1",
text: res.text,
timeline,
};
};
const transcribeByCloudflareAi = async (blob: Blob) => {
const transcribeByCloudflareAi = async (
blob: Blob
): Promise<{
engine: string;
model: string;
text: string;
timeline?: TimelineEntry[];
}> => {
const res: CfWhipserOutputType = (
await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
headers: {
@@ -172,15 +390,26 @@ export const useTranscribe = () => {
})
).data;
const wordTimeline = res.words.map((word) => {
return {
type: "word" as TimelineEntryType,
text: word.word,
startTime: word.start,
endTime: word.end,
};
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
return {
engine: "cloudflare",
model: "@cf/openai/whisper",
text: res.text,
timeline,
};
};
const transcribeByAzureAi = async (
blob: Blob,
file: File,
language: string,
params?: {
targetId?: string;
@@ -191,12 +420,11 @@ export const useTranscribe = () => {
model: string;
text: string;
tokenId: number;
timeline?: TimelineEntry[];
}> => {
const { id, token, region } = await webApi.generateSpeechToken(params);
const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
const audioConfig = sdk.AudioConfig.fromWavFileInput(
new File([blob], "audio.wav")
);
const audioConfig = sdk.AudioConfig.fromWavFileInput(file);
// setting the recognition language to learning language, such as 'en-US'.
config.speechRecognitionLanguage = language;
config.requestWordLevelTimestamps();
@@ -209,7 +437,6 @@ export const useTranscribe = () => {
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
console.log(e.result);
setOutput(e.result.text);
};
@@ -232,10 +459,40 @@ export const useTranscribe = () => {
reco.sessionStopped = (_s, _e) => {
reco.stopContinuousRecognitionAsync();
const wordTimeline: TimelineEntry[] = [];
results.forEach((result) => {
const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
const splitedWords = best.Display.trim().split(" ");
best.Words.forEach((word, index) => {
let text = word.Word;
if (splitedWords.length === best.Words.length) {
text = splitedWords[index];
}
if (
index === best.Words.length - 1 &&
!text.trim().match(sentenceEndPattern)
) {
text = text + ".";
}
wordTimeline.push({
type: "word" as TimelineEntryType,
text,
startTime: word.Offset / 10000000.0,
endTime: (word.Offset + word.Duration) / 10000000.0,
});
});
});
const timeline = wordTimelineToSentenceTimeline(wordTimeline);
resolve({
engine: "azure",
model: "whisper",
text: results.map((result) => result.DisplayText).join(" "),
timeline,
tokenId: id,
});
};
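
Most of the new transcription paths funnel into `wordTimelineToSentenceTimeline`, which groups word entries into sentences that end at `.`, `?`, or `!` and drops bracketed sound effects. An illustrative input/output pair (the words are invented; the grouping follows the rules defined at the top of this hook):

```typescript
// Illustrative input/output for wordTimelineToSentenceTimeline (words invented).
const words = [
  { type: "word", text: "[music]", startTime: 0.0, endTime: 1.0 },
  { type: "word", text: "Hello", startTime: 1.0, endTime: 1.4 },
  { type: "word", text: "there.", startTime: 1.4, endTime: 1.9 },
  { type: "word", text: "How", startTime: 2.1, endTime: 2.4 },
  { type: "word", text: "are", startTime: 2.4, endTime: 2.6 },
  { type: "word", text: "you?", startTime: 2.6, endTime: 3.0 },
];

// Expected grouping:
// [
//   { type: "sentence", text: "Hello there.", startTime: 1.0, endTime: 1.9, timeline: [] },
//   { type: "sentence", text: "How are you?", startTime: 2.1, endTime: 3.0, timeline: [] },
// ]
// "[music]" is skipped, each sentence closes at the word matching sentenceEndPattern,
// and the per-sentence word timeline is filled in afterwards by alignSegments.
```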

View File

@@ -20,9 +20,9 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
const [transcribingProgress, setTranscribingProgress] = useState<number>(0);
const [transcribing, setTranscribing] = useState<boolean>(false);
const [transcribingOutput, setTranscribingOutput] = useState<string>("");
const [service, setService] = useState<WhisperConfigType["service"]>(
whisperConfig.service
);
const [service, setService] = useState<
WhisperConfigType["service"] | "upload"
>(whisperConfig.service);
const onTransactionUpdate = (event: CustomEvent) => {
if (!transcription) return;
@@ -63,7 +63,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
const generateTranscription = async (params?: {
originalText?: string;
language?: string;
service?: WhisperConfigType["service"];
service?: WhisperConfigType["service"] | "upload";
isolate?: boolean;
}) => {
let {
@@ -87,7 +87,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}
}
}
const { engine, model, alignmentResult, tokenId } = await transcribe(
const { engine, model, transcript, timeline, tokenId } = await transcribe(
media.src,
{
targetId: media.id,
@@ -99,18 +99,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
}
);
let timeline: TimelineEntry[] = [];
alignmentResult.timeline.forEach((t) => {
if (t.type === "sentence") {
timeline.push(t);
} else {
t.timeline.forEach((st) => {
timeline.push(st);
});
}
});
timeline = preProcessTranscription(timeline);
const processedTimeline = preProcessTranscription(timeline);
if (media.language !== language) {
if (media.mediaType === "Video") {
await EnjoyApp.videos.update(media.id, {
@@ -126,8 +115,8 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
await EnjoyApp.transcriptions.update(transcription.id, {
state: "finished",
result: {
timeline: timeline,
transcript: alignmentResult.transcript,
timeline: processedTimeline,
transcript,
originalText,
tokenId,
},
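
After this refactor, `transcribe()` hands back the sentence timeline and transcript directly, so the hook no longer flattens an `AlignmentResult` itself. The persisted shape looks roughly like this (field names from the destructuring above; values are illustrative):

```typescript
// Approximate shape returned by transcribe() and stored in transcription.result.
const result = {
  engine: "whisper",
  model: "base",
  transcript: "Hello there. How are you?",
  timeline: [
    {
      type: "sentence",
      text: "Hello there.",
      startTime: 1.0,
      endTime: 1.9,
      timeline: [], // word-level entries filled in by alignSegments
    },
  ],
  tokenId: undefined,
};
```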

View File

@@ -252,6 +252,11 @@ type EnjoyAppType = {
transcript: string,
options?: any
) => Promise<AlignmentResult>;
alignSegments: (
input: string | Uint8Array,
timeline: Timeline,
options?: any
) => Promise<Timeline>;
transcode: (input: string) => Promise<string>;
check: () => Promise<boolean>;
};

View File

@@ -49,7 +49,7 @@ export function milisecondsToTimestamp(ms: number) {
const hours = Math.floor(ms / 3600000).toString();
const minutes = Math.floor((ms % 3600000) / 60000).toString();
const seconds = Math.floor(((ms % 360000) % 60000) / 1000).toString();
const milliseconds = Math.floor(((ms % 360000) % 60000) % 1000).toString();
const milliseconds = Math.round(((ms % 360000) % 60000) % 1000).toString();
return `${hours.padStart(2, "0")}:${minutes.padStart(
2,
"0"