Feat: transcribe from web (#204)

* add transcribe from web

* transcribe from web

* add azure speech ai

* fix azure speech output

* allow selecting STT service

* fix UI

* remove debug code

* lint

* fix default stt service

* tweak

* fix secondsToTimestamp
an-lee
2024-01-27 00:45:06 +08:00
committed by GitHub
parent 99577c5020
commit cec9d73bc8
18 changed files with 404 additions and 81 deletions

View File

@@ -2,9 +2,10 @@ export const DATABASE_NAME = "enjoy_database";
export const LIBRARY_PATH_SUFFIX = "EnjoyLibrary";
export const STORAGE_WORKER_ENDPOINT = "https://enjoy-storage.baizhiheizi.com";
export const AI_WORKER_ENDPOINT = "https://enjoy-ai.baizhiheizi.com";
export const WEB_API_URL = "https://enjoy-web.fly.dev";
export const REPO_URL = "https://github.com/xiaolai/everyone-can-use-english"
export const REPO_URL = "https://github.com/xiaolai/everyone-can-use-english";
// https://huggingface.co/ggerganov/whisper.cpp/tree/main
export const WHISPER_MODELS_OPTIONS = [

View File

@@ -156,7 +156,7 @@
"autoCenter": "auto center",
"inlineCaption": "inline caption",
"autoScroll": "auto scroll",
"translate:": "translate",
"translate": "translate",
"displayIpa": "display IPA",
"detail": "detail",
"remove": "remove",
@@ -295,7 +295,13 @@
"advancedSettings": "Advanced settings",
"advanced": "Advanced",
"language": "Language",
"sttAiModel": "STT AI model",
"sttAiService": "STT AI service",
"local": "Local",
"localSpeechToTextDescription": "Use local whisper model to transcribe.",
"azureAi": "Azure AI",
"azureSpeechToTextDescription": "Use Azure AI Speech to transcribe.",
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe.",
"checkingWhisper": "Checking whisper status",
"pleaseDownloadWhisperModelFirst": "Please download whisper model first",
"whisperIsWorkingGood": "Whisper is working good",

View File

@@ -156,7 +156,7 @@
"autoCenter": "自动居中",
"inlineCaption": "内联字幕",
"autoScroll": "自动滚动",
"translate:": "翻译",
"translate": "翻译",
"displayIpa": "标注音标",
"detail": "详情",
"remove": "删除",
@@ -294,7 +294,13 @@
"advancedSettingsShort": "高级设置",
"advancedSettings": "高级设置",
"language": "语言",
"sttAiModel": "语音转文本 AI 模型",
"sttAiService": "语音转文本服务",
"local": "本地",
"localSpeechToTextDescription": "使用本地 whisper 模型进行语音转文本",
"azureAi": "Azure AI",
"azureSpeechToTextDescription": "使用 Azure AI Speech 进行语音转文本",
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本",
"checkingWhisper": "正在检查 Whisper",
"pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
"whisperIsWorkingGood": "Whisper 正常工作",

View File

@@ -71,4 +71,58 @@ export class AzureSpeechSdk {
});
});
}
async transcribe(params: {
filePath: string;
language?: string;
}): Promise<SpeechRecognitionResultType[]> {
const { filePath, language = "en-US" } = params;
const audioConfig = sdk.AudioConfig.fromWavFileInput(
fs.readFileSync(filePath)
);
// set the recognition language (defaults to en-US).
this.config.speechRecognitionLanguage = language;
this.config.requestWordLevelTimestamps();
this.config.outputFormat = sdk.OutputFormat.Detailed;
// create the speech recognizer.
const reco = new sdk.SpeechRecognizer(this.config, audioConfig);
logger.debug("Start transcribe.");
let results: SpeechRecognitionResultType[] = [];
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
logger.debug("Intermediate result received: ", e.result.text);
};
reco.recognized = (_s, e) => {
logger.debug("Got final result", e.result.text);
const json = e.result.properties.getProperty(
sdk.PropertyId.SpeechServiceResponse_JsonResult
);
const result = JSON.parse(json);
results = results.concat(result);
};
reco.canceled = (_s, e) => {
logger.debug("CANCELED: Reason=" + e.reason);
if (e.reason === sdk.CancellationReason.Error) {
logger.debug(`"CANCELED: ErrorCode=${e.errorCode}`);
logger.debug("CANCELED: ErrorDetails=" + e.errorDetails);
return reject(new Error(e.errorDetails));
}
reco.stopContinuousRecognitionAsync();
};
reco.sessionStopped = (_s, _e) => {
logger.debug("\n Session stopped event.");
reco.stopContinuousRecognitionAsync();
return resolve(results);
};
reco.startContinuousRecognitionAsync();
});
}
}
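For context, `transcribeFromAzure` in `whisper.ts` (further down in this commit) drives this method roughly as follows; a minimal sketch, assuming a speech token has already been issued by the web API and the input is a WAV file:

const { token, region } = await webApi.generateSpeechToken();
const sdk = new AzureSpeechSdk(token, region);
// Word-level Offset/Duration values in the results are 100-nanosecond ticks.
const results = await sdk.transcribe({
  filePath: "/tmp/recording.wav", // hypothetical path, for illustration only
  language: "en-US",
});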

View File

@@ -148,7 +148,11 @@ export class Transcription extends Model<Transcription> {
await this.update({
state: "processing",
});
const { model, transcription } = await whisper.transcribe(wavFile, {
const {
engine = "whisper",
model,
transcription,
} = await whisper.transcribe(wavFile, {
force,
extra: [
"--split-on-word",
@@ -158,7 +162,7 @@ export class Transcription extends Model<Transcription> {
});
const result = whisper.groupTranscription(transcription);
this.update({
engine: "whisper",
engine,
model: model?.type,
result,
state: "finished",

View File

@@ -58,12 +58,29 @@ const dbPath = () => {
};
const whisperConfig = (): WhisperConfigType => {
const model = settings.getSync("whisper.model") as string;
let service = settings.getSync(
"whisper.service"
) as WhisperConfigType["service"];
if (!service) {
if (model) {
settings.setSync("whisper.service", "local");
service = "local";
} else {
settings.setSync("whisper.service", "azure");
service = "azure";
}
}
return {
service,
availableModels: settings.getSync(
"whisper.availableModels"
) as WhisperConfigType["availableModels"],
modelsPath: settings.getSync("whisper.modelsPath") as string,
model: settings.getSync("whisper.model") as string,
model,
};
};
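With this fallback in place, a fresh install that has never downloaded a local whisper model lands on the hosted Azure service; an illustrative sketch of what the function then returns (values assumed):

const config = whisperConfig();
// config.service === "azure"  -- chosen by the fallback above, since "whisper.model" is unset
// config.model is undefined until a local model is downloaded
// "whisper.service" has also been persisted back to settings as "azure"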

View File

@@ -1,13 +1,30 @@
import { ipcMain } from "electron";
import settings from "@main/settings";
import path from "path";
import { WHISPER_MODELS_OPTIONS, PROCESS_TIMEOUT } from "@/constants";
import {
WHISPER_MODELS_OPTIONS,
PROCESS_TIMEOUT,
AI_WORKER_ENDPOINT,
} from "@/constants";
import { exec } from "child_process";
import fs from "fs-extra";
import log from "electron-log/main";
import { t } from "i18next";
import axios from "axios";
import { milisecondsToTimestamp } from "@/utils";
import { AzureSpeechSdk } from "@main/azure-speech-sdk";
import { Client } from "@/api";
import { WEB_API_URL } from "@/constants";
import { sortedUniqBy, take } from "lodash";
const logger = log.scope("whisper");
const webApi = new Client({
baseUrl: process.env.WEB_API_URL || WEB_API_URL,
accessToken: settings.getSync("user.accessToken") as string,
logger: log.scope("api/client"),
});
const MAGIC_TOKENS = ["Mrs.", "Ms.", "Mr.", "Dr.", "Prof.", "St."];
const END_OF_WORD_REGEX = /[^\.!,\?][\.!\?]/g;
class Whipser {
@@ -135,7 +152,7 @@ class Whipser {
group?: boolean;
}
): Promise<
TranscriptionSegmentType[] | TranscriptionResultSegmentGroupType[]
TranscriptionResultSegmentType[] | TranscriptionResultSegmentGroupType[]
> {
const { prompt, group = false } = options || {};
@@ -164,17 +181,123 @@ class Whipser {
}
}
async transcribe(
file: string,
options?: {
force?: boolean;
extra?: string[];
}
): Promise<Partial<WhisperOutputType>> {
if (this.config.service === "local") {
return this.transcribeFromLocal(file, options);
} else if (this.config.service === "azure") {
return this.transcribeFromAzure(file);
} else if (this.config.service === "cloudflare") {
return this.transcribeFromCloudflare(file);
} else {
throw new Error("Unknown service");
}
}
async transcribeFromAzure(file: string): Promise<Partial<WhisperOutputType>> {
const { token, region } = await webApi.generateSpeechToken();
const sdk = new AzureSpeechSdk(token, region);
const results = await sdk.transcribe({
filePath: file,
});
const transcription: TranscriptionResultSegmentType[] = [];
results.forEach((result) => {
logger.debug(result);
const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
const words = best.Display.trim().split(" ");
best.Words.map((word, index) => {
let text = word.Word;
if (words.length === best.Words.length) {
text = words[index];
}
if (
index === best.Words.length - 1 &&
!text.trim().match(END_OF_WORD_REGEX)
) {
text = text + ".";
}
transcription.push({
offsets: {
from: word.Offset / 1e4,
to: (word.Offset + word.Duration) / 1e4,
},
timestamps: {
from: milisecondsToTimestamp(word.Offset / 1e4),
to: milisecondsToTimestamp((word.Offset + word.Duration) / 1e4),
},
text,
});
});
});
return {
engine: "azure",
model: {
type: "Azure AI Speech",
},
transcription,
};
}
async transcribeFromCloudflare(
file: string
): Promise<Partial<WhisperOutputType>> {
logger.debug("transcribing from CloudFlare");
const data = fs.readFileSync(file);
const res: CfWhipserOutputType = (
await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, data)
).data;
logger.debug("transcription from Web,", res);
const transcription: TranscriptionResultSegmentType[] = res.words.map(
(word) => {
return {
offsets: {
from: word.start * 1000,
to: word.end * 1000,
},
timestamps: {
from: milisecondsToTimestamp(word.start * 1000),
to: milisecondsToTimestamp(word.end * 1000),
},
text: word.word,
};
}
);
logger.debug("converted transcription,", transcription);
return {
engine: "cloudflare",
model: {
type: "@cf/openai/whisper",
},
transcription,
};
}
/* Ensure the file is in wav format
* and 16kHz sample rate
*/
async transcribe(
async transcribeFromLocal(
file: string,
options: {
options?: {
force?: boolean;
extra?: string[];
} = {}
) {
const { force = false, extra = [] } = options;
}
): Promise<Partial<WhisperOutputType>> {
logger.debug("transcribing from local");
const { force = false, extra = [] } = options || {};
const filename = path.basename(file, path.extname(file));
const tmpDir = settings.cachePath();
const outputFile = path.join(tmpDir, filename + ".json");
@@ -232,9 +355,9 @@ class Whipser {
}
groupTranscription(
transcription: TranscriptionSegmentType[]
transcription: TranscriptionResultSegmentType[]
): TranscriptionResultSegmentGroupType[] {
const generateGroup = (group?: TranscriptionSegmentType[]) => {
const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
if (!group || group.length === 0) return;
const firstWord = group[0];
@@ -255,7 +378,7 @@ class Whipser {
};
const groups: TranscriptionResultSegmentGroupType[] = [];
let group: TranscriptionSegmentType[] = [];
let group: TranscriptionResultSegmentType[] = [];
transcription.forEach((segment) => {
const text = segment.text.trim();
@@ -310,6 +433,31 @@ class Whipser {
});
});
ipcMain.handle("whisper-set-service", async (event, service) => {
if (service === "local") {
try {
await this.initialize();
settings.setSync("whisper.service", service);
this.config.service = service;
return this.config;
} catch (err) {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
}
} else if (["cloudflare", "azure"].includes(service)) {
settings.setSync("whisper.service", service);
this.config.service = service;
return this.config;
} else {
event.sender.send("on-notification", {
type: "error",
message: "Unknown service",
});
}
});
ipcMain.handle("whisper-check", async (_event) => {
return await this.check();
});

View File

@@ -326,6 +326,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
setModel: (model: string) => {
return ipcRenderer.invoke("whisper-set-model", model);
},
setService: (service: string) => {
return ipcRenderer.invoke("whisper-set-service", service);
},
check: () => {
return ipcRenderer.invoke("whisper-check");
},
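From the renderer, the new bridge method can be used to switch services and read back the refreshed config; a minimal sketch (mirrors `setWhisperService` in AppSettingsProvider later in this commit):

const config = await window.__ENJOY_APP__.whisper.setService("cloudflare");
// The main process persists "whisper.service" and resolves with the updated WhisperConfigType.
// Switching to "local" re-runs initialize() first and resolves with undefined if that fails,
// emitting an "on-notification" error event instead.
console.log(config?.service); // "cloudflare"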

View File

@@ -220,7 +220,11 @@ export const MediaCaption = (props: {
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent>
<DropdownMenuItem disabled={translating} onClick={translate}>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={translating}
onClick={translate}
>
{translating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (
@@ -228,7 +232,11 @@ export const MediaCaption = (props: {
)}
<span>{t("translate")}</span>
</DropdownMenuItem>
<DropdownMenuItem disabled={ipaGenerating} onClick={toogleIPA}>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={ipaGenerating}
onClick={toogleIPA}
>
{ipaGenerating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (

View File

@@ -8,6 +8,11 @@ import {
DialogDescription,
DialogFooter,
toast,
Select,
SelectTrigger,
SelectContent,
SelectItem,
SelectValue,
} from "@renderer/components/ui";
import { WhisperModelOptions } from "@renderer/components";
import { AppSettingsProviderContext } from "@renderer/context";
@@ -15,9 +20,8 @@ import { useContext, useEffect, useState } from "react";
import { InfoIcon, AlertCircleIcon } from "lucide-react";
export const WhisperSettings = () => {
const { whisperConfig, refreshWhisperConfig, EnjoyApp } = useContext(
AppSettingsProviderContext
);
const { whisperConfig, refreshWhisperConfig, EnjoyApp, setWhisperService } =
useContext(AppSettingsProviderContext);
const [stderr, setStderr] = useState("");
useEffect(() => {
@@ -48,7 +52,7 @@ export const WhisperSettings = () => {
<div className="flex items-start justify-between py-4">
<div className="">
<div className="flex items-center mb-2">
<span>{t("sttAiModel")}</span>
<span>{t("sttAiService")}</span>
{stderr && (
<Button
variant="ghost"
@@ -62,49 +66,74 @@ export const WhisperSettings = () => {
)}
</div>
<div className="text-sm text-muted-foreground">
{whisperConfig.model}
{whisperConfig?.service === "local" &&
t("localSpeechToTextDescription")}
{whisperConfig?.service === "azure" &&
t("azureSpeechToTextDescription")}
{whisperConfig?.service === "cloudflare" &&
t("cloudflareSpeechToTextDescription")}
</div>
</div>
<div className="flex items-center space-x-2">
<Button onClick={handleCheck} variant="secondary" size="sm">
{t("check")}
</Button>
<Dialog>
<DialogTrigger asChild>
<Button variant="secondary" size="sm">
{t("edit")}
<Select
value={whisperConfig.service}
onValueChange={(value) => {
setWhisperService(value);
}}
>
<SelectTrigger className="min-w-fit">
<SelectValue placeholder="service"></SelectValue>
</SelectTrigger>
<SelectContent>
<SelectItem value="local">{t("local")}</SelectItem>
<SelectItem value="azure">{t("azureAi")}</SelectItem>
<SelectItem value="cloudflare">{t("cloudflareAi")}</SelectItem>
</SelectContent>
</Select>
{whisperConfig.service === "local" && (
<>
<Button onClick={handleCheck} variant="secondary" size="sm">
{t("check")}
</Button>
</DialogTrigger>
<DialogContent>
<DialogHeader>{t("sttAiModel")}</DialogHeader>
<DialogDescription>
{t("chooseAIModelDependingOnYourHardware")}
</DialogDescription>
<WhisperModelOptions />
<DialogFooter>
<div className="text-xs flex items-start space-x-2">
<InfoIcon className="mr-1.5 w-4 h-4" />
<span className="flex-1 opacity-70">
{t("yourModelsWillBeDownloadedTo", {
path: whisperConfig.modelsPath,
})}
</span>
<Button
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="default"
size="sm"
>
{t("open")}
<Dialog>
<DialogTrigger asChild>
<Button variant="secondary" size="sm">
{t("model")}
</Button>
</div>
</DialogFooter>
</DialogContent>
</Dialog>
</DialogTrigger>
<DialogContent>
<DialogHeader>{t("sttAiService")}</DialogHeader>
<DialogDescription>
{t("chooseAIModelDependingOnYourHardware")}
</DialogDescription>
<WhisperModelOptions />
<DialogFooter>
<div className="text-xs flex items-start space-x-2">
<InfoIcon className="mr-1.5 w-4 h-4" />
<span className="flex-1 opacity-70">
{t("yourModelsWillBeDownloadedTo", {
path: whisperConfig.modelsPath,
})}
</span>
<Button
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="outline"
size="sm"
>
{t("open")}
</Button>
</div>
</DialogFooter>
</DialogContent>
</Dialog>
</>
)}
</div>
</div>
);

View File

@@ -5,7 +5,7 @@ import { cva, type VariantProps } from "class-variance-authority"
import { cn } from "@renderer/lib/utils"
const buttonVariants = cva(
"capitalize inline-flex items-center justify-center rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50",
"capitalize inline-flex items-center justify-center rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50 min-w-fit",
{
variants: {
variant: {

View File

@@ -68,7 +68,7 @@ export const WhisperModelOptionsPanel = () => {
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="default"
variant="outline"
size="sm"
>
{t("open")}

View File

@@ -16,6 +16,7 @@ type AppSettingsProviderState = {
logout?: () => void;
setLibraryPath?: (path: string) => Promise<void>;
setWhisperModel?: (name: string) => Promise<void>;
setWhisperService?: (name: string) => Promise<void>;
ffmpegConfig?: FfmpegConfigType;
ffmpeg?: FFmpeg;
whisperConfig?: WhisperConfigType;
@@ -190,6 +191,13 @@ export const AppSettingsProvider = ({
});
};
const setWhisperService = async (name: WhisperConfigType["service"]) => {
return EnjoyApp.whisper.setService(name).then((config) => {
if (!config) return;
setWhisperConfig(config);
});
};
const validate = async () => {
setInitialized(Boolean(user && libraryPath));
};
@@ -208,6 +216,7 @@ export const AppSettingsProvider = ({
libraryPath,
setLibraryPath: setLibraryPathHandler,
setWhisperModel,
setWhisperService,
ffmpegConfig,
ffmpeg,
whisperConfig,

View File

@@ -16,8 +16,11 @@ export function cn(...inputs: ClassValue[]) {
}
export function secondsToTimestamp(seconds: number) {
const date = new Date(seconds * 1000);
return date.toISOString().substr(11, 8);
const h = Math.floor(seconds / 3600).toString();
const m = Math.floor((seconds % 3600) / 60).toString();
const s = Math.floor((seconds % 3600) % 60).toString();
return `${h.padStart(2, "0")}:${m.padStart(2, "0")}:${s.padStart(2, "0")}`;
}
export function humanizeDuration(
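The rewrite matters for durations of 24 hours or more, where the previous `Date`/ISO-string slicing wrapped around; illustrative expectations for the new arithmetic:

// secondsToTimestamp(75)    => "00:01:15"
// secondsToTimestamp(3661)  => "01:01:01"
// secondsToTimestamp(90000) => "25:00:00"  (the old Date-based version returned "01:00:00")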

View File

@@ -186,6 +186,9 @@ type EnjoyAppType = {
config: () => Promise<WhisperConfigType>;
check: () => Promise<{ success: boolean; log: string }>;
setModel: (model: string) => Promise<WhisperConfigType>;
setService: (
service: WhisperConfigType["service"]
) => Promise<WhisperConfigType>;
transcribeBlob: (
blob: { type: string; arrayBuffer: ArrayBuffer },
prompt?: string

View File

@@ -26,6 +26,7 @@ type NotificationType = {
};
type WhisperConfigType = {
service: "local" | "azure" | "cloudflare";
availableModels: {
type: string;
name: string;
@@ -39,24 +40,25 @@ type WhisperConfigType = {
};
type WhisperOutputType = {
engine?: string;
model: {
audio: {
audio?: {
cts: number;
head: number;
layer: number;
state: number;
};
ftype: number;
mels: number;
multilingual: number;
text: {
ftype?: number;
mels?: number;
multilingual?: number;
text?: {
cts: number;
head: number;
layer: number;
state: number;
};
type: string;
vocab: number;
vocab?: number;
};
params: {
language: string;
@@ -67,19 +69,17 @@ type WhisperOutputType = {
languate: string;
};
systeminfo: string;
transcription: TranscriptionSegmentType[];
transcription: TranscriptionResultSegmentType[];
};
type TranscriptionSegmentType = {
offsets: {
from: number;
to: number;
};
type CfWhipserOutputType = {
text: string;
timestamps: {
from: string;
to: string;
};
words_count: number;
words: {
word: string;
start: number;
end: number;
}[];
};
type TransactionStateType = {

View File

@@ -62,3 +62,24 @@ type PronunciationAssessmentWordResultType = {
};
};
};
type SpeechRecognitionResultType = {
Id: string;
RecognitionStatus: string;
Offset: number;
Duration: number;
Channel: number;
DisplayText: string;
NBest: {
Confidence: number;
Lexical: string;
ITN: string;
MaskedITN: string;
Display: string;
Words: {
Word: string;
Offset: number;
Duration: number;
}[];
}[];
};

View File

@@ -66,3 +66,14 @@ export function generatePitch(peaks: Float32Array, sampleRate: number) {
return { frequencies, baseFrequency };
}
export function milisecondsToTimestamp(ms: number) {
const hours = Math.floor(ms / 3600000).toString();
const minutes = Math.floor((ms % 3600000) / 60000).toString();
const seconds = Math.floor(((ms % 3600000) % 60000) / 1000).toString();
const milliseconds = Math.floor(((ms % 3600000) % 60000) % 1000).toString();
return `${hours.padStart(2, "0")}:${minutes.padStart(
2,
"0"
)}:${seconds.padStart(2, "0")},${milliseconds.padStart(3, "0")}`;
}
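For reference, this mirrors the "HH:MM:SS,mmm" timestamps produced by the local whisper path, so Azure and Cloudflare results line up with the existing transcription format; illustrative expectations:

// milisecondsToTimestamp(1500)    => "00:00:01,500"
// milisecondsToTimestamp(3723456) => "01:02:03,456"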