Feat: refactor STT service (#294)
* Add STT hook interface
* Fix crypto being exported to the browser
* Refactor use-transcribe
* Allow using OpenAI for STT
* Remove deprecated code
* Fix undefined method
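The gist of the refactor: speech-to-text moves out of the Electron main process and into the renderer. A new useTranscribe hook transcodes media with ffmpeg.wasm, runs whichever STT service is configured (local whisper.cpp, Azure, Cloudflare, or the newly added OpenAI), and persists the result over IPC. A condensed sketch of the new flow, abbreviated from the MediaTranscription changes below (the surrounding component wiring is assumed):

    const { transcribe } = useTranscribe();

    const generate = async () => {
      // transcode + run whichever STT service whisperConfig selects
      const { engine, model, result } = await transcribe(mediaUrl);
      // persist the outcome; the main process no longer runs STT itself
      await EnjoyApp.transcriptions.update(transcription.id, {
        state: "finished",
        result,
        engine,
        model,
      });
    };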
@@ -317,6 +317,7 @@
   "azureSpeechToTextDescription": "Use Azure AI Speech to transcribe. It is a paid service.",
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
+  "openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
   "checkingWhisper": "Checking whisper status",
   "pleaseDownloadWhisperModelFirst": "Please download whisper model first",
   "whisperIsWorkingGood": "Whisper is working good",
@@ -316,6 +316,7 @@
   "azureSpeechToTextDescription": "使用 Azure AI Speech 进行语音转文本,收费服务",
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
+  "openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
   "checkingWhisper": "正在检查 Whisper",
   "pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
   "whisperIsWorkingGood": "Whisper 正常工作",
@@ -3,7 +3,7 @@ import { Speech } from "@main/db/models";
 import fs from "fs-extra";
 import path from "path";
 import settings from "@main/settings";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 
 class SpeechesHandler {
   private async create(
@@ -1,7 +1,6 @@
 import { ipcMain, IpcMainEvent } from "electron";
 import { Transcription, Audio, Video } from "@main/db/models";
-import { WhereOptions, Attributes } from "sequelize";
-import { t } from "i18next";
+import { Attributes } from "sequelize";
 import log from "electron-log/main";
 
 const logger = log.scope("db/handlers/transcriptions-handler");
@@ -44,7 +43,7 @@ class TranscriptionsHandler {
     id: string,
     params: Attributes<Transcription>
   ) {
-    const { result } = params;
+    const { result, engine, model, state } = params;
 
     return Transcription.findOne({
       where: { id },
@@ -53,63 +52,7 @@ class TranscriptionsHandler {
         if (!transcription) {
           throw new Error("models.transcription.notFound");
         }
-        transcription.update({ result });
+        transcription.update({ result, engine, model, state });
-      })
-      .catch((err) => {
-        logger.error(err);
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
-  }
-
-  private async process(
-    event: IpcMainEvent,
-    where: WhereOptions<Attributes<Transcription>>,
-    options?: {
-      force?: boolean;
-      blob: {
-        type: string;
-        arrayBuffer: ArrayBuffer;
-      };
-    }
-  ) {
-    const { force = true, blob } = options || {};
-    return Transcription.findOne({
-      where: {
-        ...where,
-      },
-    })
-      .then((transcription) => {
-        if (!transcription) {
-          throw new Error("models.transcription.notFound");
-        }
-
-        const interval = setInterval(() => {
-          event.sender.send("on-notification", {
-            type: "warning",
-            message: t("stillTranscribing"),
-          });
-        }, 1000 * 10);
-
-        transcription
-          .process({
-            force,
-            wavFileBlob: blob,
-            onProgress: (progress: number) => {
-              event.sender.send("transcription-on-progress", progress);
-            },
-          })
-          .catch((err) => {
-            event.sender.send("on-notification", {
-              type: "error",
-              message: err.message,
-            });
-          })
-          .finally(() => {
-            clearInterval(interval);
-          });
       })
       .catch((err) => {
         logger.error(err);
@@ -122,7 +65,6 @@ class TranscriptionsHandler {
 
   register() {
     ipcMain.handle("transcriptions-find-or-create", this.findOrCreate);
-    ipcMain.handle("transcriptions-process", this.process);
     ipcMain.handle("transcriptions-update", this.update);
   }
 }
@@ -17,7 +17,7 @@ import {
 import { Recording, Speech, Transcription, Video } from "@main/db/models";
 import settings from "@main/settings";
 import { AudioFormats, VideoFormats, WEB_API_URL } from "@/constants";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import path from "path";
 import fs from "fs-extra";
 import { t } from "i18next";
@@ -191,15 +191,6 @@ export class Audio extends Model<Audio> {
     }
   }
 
-  @AfterCreate
-  static transcribeAsync(audio: Audio) {
-    if (settings.ffmpegConfig().ready) {
-      setTimeout(() => {
-        audio.transcribe();
-      }, 500);
-    }
-  }
-
   @AfterCreate
   static autoSync(audio: Audio) {
     // auto sync should not block the main thread
@@ -332,38 +323,6 @@ export class Audio extends Model<Audio> {
     });
   }
 
-  // STT using whisper
-  async transcribe() {
-    Transcription.findOrCreate({
-      where: {
-        targetId: this.id,
-        targetType: "Audio",
-      },
-      defaults: {
-        targetId: this.id,
-        targetType: "Audio",
-        targetMd5: this.md5,
-      },
-    })
-      .then(([transcription, _created]) => {
-        if (transcription.state === "pending") {
-          transcription.process();
-        } else if (transcription.state === "finished") {
-          transcription.process({ force: true });
-        } else if (transcription.state === "processing") {
-          logger.warn(
-            `[${transcription.getDataValue("id")}]`,
-            "Transcription is processing."
-          );
-        }
-      })
-      .catch((err) => {
-        logger.error(err);
-
-        throw err;
-      });
-  }
-
   static notify(audio: Audio, action: "create" | "update" | "destroy") {
     if (!mainWindow.win) return;
 
@@ -29,7 +29,7 @@ import fs from "fs-extra";
 import path from "path";
 import Ffmpeg from "@main/ffmpeg";
 import whisper from "@main/whisper";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import { WEB_API_URL } from "@/constants";
 import proxyAgent from "@main/proxy-agent";
 
@@ -19,7 +19,7 @@ import { Audio, PronunciationAssessment, Video } from "@main/db/models";
 import fs from "fs-extra";
 import path from "path";
 import settings from "@main/settings";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import log from "electron-log/main";
 import storage from "@main/storage";
 import { Client } from "@/api";
@@ -20,7 +20,7 @@ import path from "path";
 import settings from "@main/settings";
 import OpenAI, { type ClientOptions } from "openai";
 import { t } from "i18next";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import { Audio, Message } from "@main/db/models";
 import log from "electron-log/main";
 import { WEB_API_URL } from "@/constants";
@@ -1,5 +1,4 @@
 import {
-  AfterCreate,
   AfterUpdate,
   AfterDestroy,
   AfterFind,
@@ -13,18 +12,13 @@ import {
   Unique,
 } from "sequelize-typescript";
 import { Audio, Video } from "@main/db/models";
-import whisper from "@main/whisper";
 import mainWindow from "@main/window";
 import log from "electron-log/main";
 import { Client } from "@/api";
 import { WEB_API_URL, PROCESS_TIMEOUT } from "@/constants";
 import settings from "@main/settings";
-import Ffmpeg from "@main/ffmpeg";
-import path from "path";
-import fs from "fs-extra";
 
 const logger = log.scope("db/models/transcription");
 
 @Table({
   modelName: "Transcription",
   tableName: "transcriptions",
@@ -80,120 +74,13 @@ export class Transcription extends Model<Transcription> {
     const webApi = new Client({
       baseUrl: process.env.WEB_API_URL || WEB_API_URL,
       accessToken: settings.getSync("user.accessToken") as string,
-      logger: log.scope("api/client"),
+      logger,
     });
     return webApi.syncTranscription(this.toJSON()).then(() => {
       this.update({ syncedAt: new Date() });
     });
   }
 
-  // STT using whisper
-  async process(
-    options: {
-      force?: boolean;
-      wavFileBlob?: { type: string; arrayBuffer: ArrayBuffer };
-      onProgress?: (progress: number) => void;
-    } = {}
-  ) {
-    if (this.getDataValue("state") === "processing") return;
-
-    const { force = false, wavFileBlob, onProgress } = options;
-
-    logger.info(`[${this.getDataValue("id")}]`, "Start to transcribe.");
-
-    let filePath = "";
-    if (this.targetType === "Audio") {
-      filePath = (await Audio.findByPk(this.targetId)).filePath;
-    } else if (this.targetType === "Video") {
-      filePath = (await Video.findByPk(this.targetId)).filePath;
-    }
-
-    if (!filePath) {
-      logger.error(`[${this.getDataValue("id")}]`, "No file path.");
-      throw new Error("No file path.");
-    }
-
-    let wavFile: string = filePath;
-
-    const tmpDir = settings.cachePath();
-    const outputFile = path.join(
-      tmpDir,
-      path.basename(filePath, path.extname(filePath)) + ".wav"
-    );
-
-    if (wavFileBlob) {
-      const format = wavFileBlob.type.split("/")[1];
-
-      if (format !== "wav") {
-        throw new Error("Only wav format is supported");
-      }
-
-      await fs.outputFile(outputFile, Buffer.from(wavFileBlob.arrayBuffer));
-      wavFile = outputFile;
-    } else if (settings.ffmpegConfig().ready) {
-      const ffmpeg = new Ffmpeg();
-      try {
-        wavFile = await ffmpeg.prepareForWhisper(
-          filePath,
-          path.join(
-            tmpDir,
-            path.basename(filePath, path.extname(filePath)) + ".wav"
-          )
-        );
-      } catch (err) {
-        logger.error("ffmpeg error", err);
-      }
-    }
-
-    try {
-      await this.update({
-        state: "processing",
-      });
-      const {
-        engine = "whisper",
-        model,
-        transcription,
-      } = await whisper.transcribe(wavFile, {
-        force,
-        extra: [
-          "--split-on-word",
-          "--max-len",
-          "1",
-          "--prompt",
-          `"Hello! Welcome to listen to this audio."`,
-        ],
-        onProgress,
-      });
-      const result = whisper.groupTranscription(transcription);
-      this.update({
-        engine,
-        model: model?.type,
-        result,
-        state: "finished",
-      }).then(() => this.sync());
-
-      logger.info(`[${this.getDataValue("id")}]`, "Transcription finished.");
-    } catch (err) {
-      logger.error(
-        `[${this.getDataValue("id")}]`,
-        "Transcription not finished.",
-        err
-      );
-      this.update({
-        state: "pending",
-      });
-
-      throw err;
-    }
-  }
-
-  @AfterCreate
-  static startTranscribeAsync(transcription: Transcription) {
-    setTimeout(() => {
-      transcription.process();
-    }, 0);
-  }
-
   @AfterUpdate
   static notifyForUpdate(transcription: Transcription) {
     this.notify(transcription, "update");
@@ -17,7 +17,7 @@ import {
 import { Audio, Recording, Speech, Transcription } from "@main/db/models";
 import settings from "@main/settings";
 import { AudioFormats, VideoFormats, WEB_API_URL } from "@/constants";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import path from "path";
 import fs from "fs-extra";
 import { t } from "i18next";
@@ -213,15 +213,6 @@ export class Video extends Model<Video> {
     }
   }
 
-  @AfterCreate
-  static transcribeAsync(video: Video) {
-    if (settings.ffmpegConfig().ready) {
-      setTimeout(() => {
-        video.transcribe();
-      }, 500);
-    }
-  }
-
   @AfterCreate
   static autoSync(video: Video) {
     // auto sync should not block the main thread
@@ -355,37 +346,6 @@ export class Video extends Model<Video> {
     });
   }
 
-  async transcribe() {
-    Transcription.findOrCreate({
-      where: {
-        targetId: this.id,
-        targetType: "Video",
-      },
-      defaults: {
-        targetId: this.id,
-        targetType: "Video",
-        targetMd5: this.md5,
-      },
-    })
-      .then(([transcription, _created]) => {
-        if (transcription.state === "pending") {
-          transcription.process();
-        } else if (transcription.state === "finished") {
-          transcription.process({ force: true });
-        } else if (transcription.state === "processing") {
-          logger.warn(
-            `[${transcription.getDataValue("id")}]`,
-            "Transcription is processing."
-          );
-        }
-      })
-      .catch((err) => {
-        logger.error(err);
-
-        throw err;
-      });
-  }
-
   static notify(video: Video, action: "create" | "update" | "destroy") {
     if (!mainWindow.win) return;
 
enjoy/src/main/utils.ts (new file)
@@ -0,0 +1,38 @@
+import { createHash } from "crypto";
+import { createReadStream } from "fs";
+
+export function hashFile(
+  path: string,
+  options: { algo: string }
+): Promise<string> {
+  const algo = options.algo || "md5";
+  return new Promise((resolve, reject) => {
+    const hash = createHash(algo);
+    const stream = createReadStream(path);
+    stream.on("error", reject);
+    stream.on("data", (chunk) => hash.update(chunk));
+    stream.on("end", () => resolve(hash.digest("hex")));
+  });
+}
+
+export function hashBlob(
+  blob: Blob,
+  options: { algo: string }
+): Promise<string> {
+  const algo = options.algo || "md5";
+  return new Promise((resolve, reject) => {
+    const hash = createHash(algo);
+    const reader = new FileReader();
+    reader.onload = () => {
+      if (reader.result instanceof ArrayBuffer) {
+        const buffer = Buffer.from(reader.result);
+        hash.update(buffer);
+        resolve(hash.digest("hex"));
+      } else {
+        reject(new Error("Unexpected result from FileReader"));
+      }
+    };
+    reader.onerror = reject;
+    reader.readAsArrayBuffer(blob);
+  });
+}
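hashFile relies on Node's crypto and fs modules, which is why it now lives under @main instead of the shared renderer utils; that is what the repeated "@/utils" to "@main/utils" import changes in this commit are about. A minimal usage sketch (the file path is hypothetical):

    import { hashFile } from "@main/utils";

    // MD5 digest, as used for fields like Transcription.targetMd5
    const md5 = await hashFile("/path/to/audio.mp3", { algo: "md5" });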
@@ -152,162 +152,17 @@ class Whipser {
     });
   }
 
-  async transcribeBlob(
-    blob: { type: string; arrayBuffer: ArrayBuffer },
-    options?: {
-      prompt?: string;
-      group?: boolean;
-    }
-  ): Promise<
-    TranscriptionResultSegmentType[] | TranscriptionResultSegmentGroupType[]
-  > {
-    const { prompt, group = false } = options || {};
-
-    const format = blob.type.split("/")[1];
-
-    if (format !== "wav") {
-      throw new Error("Only wav format is supported");
-    }
-
-    const tempfile = path.join(settings.cachePath(), `${Date.now()}.${format}`);
-    await fs.outputFile(tempfile, Buffer.from(blob.arrayBuffer));
-
-    const extra = [];
-    if (prompt) {
-      extra.push(`--prompt "${prompt.replace(/"/g, '\\"')}"`);
-    }
-    const { transcription } = await this.transcribe(tempfile, {
-      force: true,
-      extra,
-    });
-
-    if (group) {
-      return this.groupTranscription(transcription);
-    } else {
-      return transcription;
-    }
-  }
-
-  async transcribe(
-    file: string,
-    options?: {
-      force?: boolean;
-      extra?: string[];
-      onProgress?: (progress: number) => void;
-    }
-  ): Promise<Partial<WhisperOutputType>> {
-    if (this.config.service === "local") {
-      return this.transcribeFromLocal(file, options);
-    } else if (this.config.service === "azure") {
-      return this.transcribeFromAzure(file);
-    } else if (this.config.service === "cloudflare") {
-      return this.transcribeFromCloudflare(file);
-    } else {
-      throw new Error("Unknown service");
-    }
-  }
-
-  async transcribeFromAzure(file: string): Promise<Partial<WhisperOutputType>> {
-    const webApi = new Client({
-      baseUrl: process.env.WEB_API_URL || WEB_API_URL,
-      accessToken: settings.getSync("user.accessToken") as string,
-      logger: log.scope("api/client"),
-    });
-    const { token, region } = await webApi.generateSpeechToken();
-    const sdk = new AzureSpeechSdk(token, region);
-
-    const results = await sdk.transcribe({
-      filePath: file,
-    });
-
-    const transcription: TranscriptionResultSegmentType[] = [];
-    results.forEach((result) => {
-      logger.debug(result);
-      const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
-      const words = best.Display.trim().split(" ");
-
-      best.Words.map((word, index) => {
-        let text = word.Word;
-        if (words.length === best.Words.length) {
-          text = words[index];
-        }
-
-        if (
-          index === best.Words.length - 1 &&
-          !text.trim().match(END_OF_WORD_REGEX)
-        ) {
-          text = text + ".";
-        }
-
-        transcription.push({
-          offsets: {
-            from: word.Offset / 1e4,
-            to: (word.Offset + word.Duration) / 1e4,
-          },
-          timestamps: {
-            from: milisecondsToTimestamp(word.Offset / 1e4),
-            to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
-          },
-          text,
-        });
-      });
-    });
-
-    return {
-      engine: "azure",
-      model: {
-        type: "Azure AI Speech",
-      },
-      transcription,
-    };
-  }
-
-  async transcribeFromCloudflare(
-    file: string
-  ): Promise<Partial<WhisperOutputType>> {
-    logger.debug("transcribing from CloudFlare");
-
-    const data = fs.readFileSync(file);
-    const res: CfWhipserOutputType = (
-      await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, data, {
-        headers: {
-          Authorization: `Bearer ${settings.getSync("user.accessToken")}`,
-        },
-      })
-    ).data;
-    logger.debug("transcription from Web,", res);
-
-    const transcription: TranscriptionResultSegmentType[] = res.words.map(
-      (word) => {
-        return {
-          offsets: {
-            from: word.start * 1000,
-            to: word.end * 1000,
-          },
-          timestamps: {
-            from: milisecondsToTimestamp(word.start * 1000),
-            to: milisecondsToTimestamp(word.end * 1000),
-          },
-          text: word.word,
-        };
-      }
-    );
-    logger.debug("converted transcription,", transcription);
-
-    return {
-      engine: "cloudflare",
-      model: {
-        type: "@cf/openai/whisper",
-      },
-      transcription,
-    };
-  }
-
   /* Ensure the file is in wav format
    * and 16kHz sample rate
    */
-  async transcribeFromLocal(
-    file: string,
+  async transcribe(
+    params: {
+      file?: string;
+      blob?: {
+        type: string;
+        arrayBuffer: ArrayBuffer;
+      };
+    },
     options?: {
       force?: boolean;
       extra?: string[];
@@ -315,6 +170,28 @@ class Whipser {
     }
   ): Promise<Partial<WhisperOutputType>> {
     logger.debug("transcribing from local");
 
+    const { blob } = params;
+    let { file } = params;
+    if (!file && !blob) {
+      throw new Error("No file or blob provided");
+    }
+
+    if (!this.currentModel()) {
+      throw new Error(t("pleaseDownloadWhisperModelFirst"));
+    }
+
+    if (blob) {
+      const format = blob.type.split("/")[1];
+
+      if (format !== "wav") {
+        throw new Error("Only wav format is supported");
+      }
+
+      file = path.join(settings.cachePath(), `${Date.now()}.${format}`);
+      await fs.outputFile(file, Buffer.from(blob.arrayBuffer));
+    }
+
     const { force = false, extra = [], onProgress } = options || {};
     const filename = path.basename(file, path.extname(file));
     const tmpDir = settings.cachePath();
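The unified transcribe entry point now accepts either a file path or a wav blob; as the hunk above shows, a blob arriving over IPC is written to the cache directory before whisper.cpp runs. A sketch of the two call shapes (the path and the arrayBuffer variable are illustrative):

    // From a file already on disk:
    await whisper.transcribe({ file: "/tmp/audio.wav" }, { force: true });

    // From a wav blob received over IPC (persisted to settings.cachePath() internally):
    await whisper.transcribe(
      { blob: { type: "audio/wav", arrayBuffer } },
      { onProgress: (progress) => logger.debug(`${progress}%`) }
    );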
@@ -326,46 +203,35 @@ class Whipser {
       return fs.readJson(outputFile);
     }
 
-    if (!this.currentModel()) {
-      throw new Error(t("pleaseDownloadWhisperModelFirst"));
-    }
-
-    const command = [
-      `"${this.binMain}"`,
-      `--file "${file}"`,
-      `--model "${this.currentModel()}"`,
-      "--output-json",
-      `--output-file "${path.join(tmpDir, filename)}"`,
-      "-pp",
-      ...extra,
-    ].join(" ");
+    const commandArguments = [
+      "--file",
+      file,
+      "--model",
+      this.currentModel(),
+      "--output-json",
+      "--output-file",
+      path.join(tmpDir, filename),
+      "-pp",
+      "--split-on-word",
+      "--max-len",
+      "1",
+      ...extra,
+    ];
 
-    logger.info(`Running command: ${command}`);
-
-    const transcribe = spawn(
-      this.binMain,
-      [
-        "--file",
-        file,
-        "--model",
-        this.currentModel(),
-        "--output-json",
-        "--output-file",
-        path.join(tmpDir, filename),
-        "-pp",
-        ...extra,
-      ],
-      {
-        timeout: PROCESS_TIMEOUT,
-      }
-    );
+    logger.info(
+      `Running command: ${this.binMain} ${commandArguments.join(" ")}`
+    );
+
+    const command = spawn(this.binMain, commandArguments, {
+      timeout: PROCESS_TIMEOUT,
+    });
 
     return new Promise((resolve, reject) => {
-      transcribe.stdout.on("data", (data) => {
+      command.stdout.on("data", (data) => {
         logger.debug(`stdout: ${data}`);
       });
 
-      transcribe.stderr.on("data", (data) => {
+      command.stderr.on("data", (data) => {
         const output = data.toString();
         logger.error(`stderr: ${output}`);
         if (output.startsWith("whisper_print_progress_callback")) {
@@ -374,16 +240,16 @@ class Whipser {
         }
       });
 
-      transcribe.on("exit", (code) => {
+      command.on("exit", (code) => {
         logger.info(`transcribe process exited with code ${code}`);
       });
 
-      transcribe.on("error", (err) => {
+      command.on("error", (err) => {
         logger.error("transcribe error", err.message);
         reject(err);
       });
 
-      transcribe.on("close", () => {
+      command.on("close", () => {
         if (fs.pathExistsSync(outputFile)) {
           resolve(fs.readJson(outputFile));
         } else {
@@ -393,57 +259,6 @@ class Whipser {
     });
   }
 
-  groupTranscription(
-    transcription: TranscriptionResultSegmentType[]
-  ): TranscriptionResultSegmentGroupType[] {
-    const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
-      if (!group || group.length === 0) return;
-
-      const firstWord = group[0];
-      const lastWord = group[group.length - 1];
-
-      return {
-        offsets: {
-          from: firstWord.offsets.from,
-          to: lastWord.offsets.to,
-        },
-        text: group.map((w) => w.text.trim()).join(" "),
-        timestamps: {
-          from: firstWord.timestamps.from,
-          to: lastWord.timestamps.to,
-        },
-        segments: group,
-      };
-    };
-
-    const groups: TranscriptionResultSegmentGroupType[] = [];
-    let group: TranscriptionResultSegmentType[] = [];
-
-    transcription.forEach((segment) => {
-      const text = segment.text.trim();
-      if (!text) return;
-
-      group.push(segment);
-
-      if (
-        !MAGIC_TOKENS.includes(text) &&
-        segment.text.trim().match(END_OF_WORD_REGEX)
-      ) {
-        // Group a complete sentence;
-        groups.push(generateGroup(group));
-
-        // init a new group
-        group = [];
-      }
-    });
-
-    // Group the last group
-    const lastSentence = generateGroup(group);
-    if (lastSentence) groups.push(lastSentence);
-
-    return groups;
-  }
-
   registerIpcHandlers() {
     ipcMain.handle("whisper-config", async () => {
       try {
@@ -489,7 +304,7 @@ class Whipser {
           message: err.message,
         });
       }
-    } else if (["cloudflare", "azure"].includes(service)) {
+    } else if (["cloudflare", "azure", "openai"].includes(service)) {
       settings.setSync("whisper.service", service);
       this.config.service = service;
       return this.config;
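The service-switching handler shown here now accepts "openai" as well, so switching from the renderer is a one-liner (a sketch; the preload method is declared in enjoy-app.d.ts further down):

    // "openai" joins "local", "azure" and "cloudflare" as a valid service
    await EnjoyApp.whisper.setService("openai");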
@@ -505,9 +320,14 @@ class Whipser {
       return await this.check();
     });
 
-    ipcMain.handle("whisper-transcribe-blob", async (event, blob, prompt) => {
+    ipcMain.handle("whisper-transcribe", async (event, params, options) => {
       try {
-        return await this.transcribeBlob(blob, prompt);
+        return await this.transcribe(params, {
+          ...options,
+          onProgress: (progress) => {
+            event.sender.send("whisper-on-progress", progress);
+          },
+        });
       } catch (err) {
         event.sender.send("on-notification", {
           type: "error",
@@ -363,11 +363,26 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     check: () => {
       return ipcRenderer.invoke("whisper-check");
     },
-    transcribeBlob: (
-      blob: { type: string; arrayBuffer: ArrayBuffer },
-      prompt?: string
+    transcribe: (
+      params: {
+        file?: string;
+        blob?: {
+          type: string;
+          arrayBuffer: ArrayBuffer;
+        };
+      },
+      options?: {
+        force?: boolean;
+        extra?: string[];
+      }
     ) => {
-      return ipcRenderer.invoke("whisper-transcribe-blob", blob, prompt);
+      return ipcRenderer.invoke("whisper-transcribe", params, options);
+    },
+    onProgress: (
+      callback: (event: IpcRendererEvent, progress: number) => void
+    ) => ipcRenderer.on("whisper-on-progress", callback),
+    removeProgressListeners: () => {
+      ipcRenderer.removeAllListeners("whisper-on-progress");
     },
   },
   ffmpeg: {
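Progress reporting moves with the handler: the preload now exposes whisper.onProgress and removeProgressListeners on the "whisper-on-progress" channel instead of "transcription-on-progress". A sketch of how a renderer component subscribes, mirroring the MediaTranscription changes below (the setProgress state setter is assumed):

    useEffect(() => {
      EnjoyApp.whisper.onProgress((_event, progress: number) => {
        setProgress(Math.min(progress, 100)); // clamp, as the component does
      });
      return () => {
        EnjoyApp.whisper.removeProgressListeners();
      };
    }, []);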
@@ -425,18 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     findOrCreate: (params: any) => {
       return ipcRenderer.invoke("transcriptions-find-or-create", params);
     },
-    process: (params: any, options: any) => {
-      return ipcRenderer.invoke("transcriptions-process", params, options);
-    },
     update: (id: string, params: any) => {
       return ipcRenderer.invoke("transcriptions-update", id, params);
     },
-    onProgress: (
-      callback: (event: IpcRendererEvent, progress: number) => void
-    ) => ipcRenderer.on("transcription-on-progress", callback),
-    removeProgressListeners: () => {
-      ipcRenderer.removeAllListeners("transcription-on-progress");
-    },
   },
   waveforms: {
     find: (id: string) => {
@@ -12,6 +12,7 @@ import {
   ScrollArea,
   Button,
   PingPoint,
+  toast,
 } from "@renderer/components/ui";
 import React, { useEffect, useContext, useState } from "react";
 import { t } from "i18next";
@@ -19,6 +20,7 @@ import { LoaderIcon, CheckCircleIcon, MicIcon } from "lucide-react";
 import {
   DbProviderContext,
   AppSettingsProviderContext,
+  AISettingsProviderContext,
 } from "@renderer/context";
 import { useTranscribe } from "@renderer/hooks";
 
@@ -32,6 +34,7 @@ export const MediaTranscription = (props: {
   onSelectSegment?: (index: number) => void;
 }) => {
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
+  const { whisperConfig } = useContext(AISettingsProviderContext);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
   const {
     transcription,
@@ -55,13 +58,19 @@ export const MediaTranscription = (props: {
 
     setTranscribing(true);
     setProgress(0);
-    transcribe({
-      mediaId,
-      mediaType,
-      mediaSrc: mediaUrl,
-    }).finally(() => {
-      setTranscribing(false);
-    });
+    try {
+      const { engine, model, result } = await transcribe(mediaUrl);
+      await EnjoyApp.transcriptions.update(transcription.id, {
+        state: "finished",
+        result,
+        engine,
+        model,
+      });
+    } catch (err) {
+      toast.error(err.message);
+    }
+
+    setTranscribing(false);
   };
 
   const fetchSegmentStats = async () => {
@@ -80,14 +89,16 @@ export const MediaTranscription = (props: {
       generate();
     }
 
-    EnjoyApp.transcriptions.onProgress((_, p: number) => {
-      if (p > 100) p = 100;
-      setProgress(p);
-    });
+    if (whisperConfig.service === "local") {
+      EnjoyApp.whisper.onProgress((_, p: number) => {
+        if (p > 100) p = 100;
+        setProgress(p);
+      });
+    }
 
     return () => {
       removeDbListener(fetchSegmentStats);
-      EnjoyApp.transcriptions.removeProgressListeners();
+      EnjoyApp.whisper.removeProgressListeners();
     };
   }, [mediaId, mediaType]);
 
@@ -114,7 +125,9 @@ export const MediaTranscription = (props: {
         {transcribing || transcription.state === "processing" ? (
           <>
             <PingPoint colorClassName="bg-yellow-500" />
-            <div className="text-sm">{progress}%</div>
+            <div className="text-sm">
+              {whisperConfig.service === "local" && `${progress}%`}
+            </div>
           </>
         ) : transcription.state === "finished" ? (
           <CheckCircleIcon className="text-green-500 w-4 h-4" />
@@ -77,6 +77,8 @@ export const WhisperSettings = () => {
             t("azureSpeechToTextDescription")}
           {whisperConfig?.service === "cloudflare" &&
             t("cloudflareSpeechToTextDescription")}
+          {whisperConfig?.service === "openai" &&
+            t("openaiSpeechToTextDescription")}
         </div>
       </div>
 
@@ -94,6 +96,7 @@ export const WhisperSettings = () => {
             <SelectItem value="local">{t("local")}</SelectItem>
             <SelectItem value="azure">{t("azureAi")}</SelectItem>
             <SelectItem value="cloudflare">{t("cloudflareAi")}</SelectItem>
+            <SelectItem value="openai">OpenAI</SelectItem>
           </SelectContent>
         </Select>
 
@@ -1,3 +1,3 @@
-export * from './use-transcode';
+export * from './use-transcribe';
 export * from './use-ai-command';
 export * from './use-conversation';
@@ -1,58 +0,0 @@
-import { AppSettingsProviderContext } from "@renderer/context";
-import { useContext } from "react";
-import { toast } from "@renderer/components/ui";
-import { t } from "i18next";
-import { fetchFile } from "@ffmpeg/util";
-
-export const useTranscribe = () => {
-  const { EnjoyApp, ffmpeg } = useContext(AppSettingsProviderContext);
-
-  const transcode = async (src: string, options?: string[]) => {
-    if (!ffmpeg?.loaded) return;
-
-    options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
-
-    try {
-      const uri = new URL(src);
-      const input = uri.pathname.split("/").pop();
-      const output = input.replace(/\.[^/.]+$/, ".wav");
-      await ffmpeg.writeFile(input, await fetchFile(src));
-      await ffmpeg.exec(["-i", input, ...options, output]);
-      const data = await ffmpeg.readFile(output);
-      return new Blob([data], { type: "audio/wav" });
-    } catch (e) {
-      toast.error(t("transcodeError"));
-    }
-  };
-
-  const transcribe = async (params: {
-    mediaSrc: string;
-    mediaId: string;
-    mediaType: "Audio" | "Video";
-  }) => {
-    const { mediaSrc, mediaId, mediaType } = params;
-    const data = await transcode(mediaSrc);
-    let blob;
-    if (data) {
-      blob = {
-        type: data.type.split(";")[0],
-        arrayBuffer: await data.arrayBuffer(),
-      };
-    }
-
-    return EnjoyApp.transcriptions.process(
-      {
-        targetId: mediaId,
-        targetType: mediaType,
-      },
-      {
-        blob,
-      }
-    );
-  };
-
-  return {
-    transcode,
-    transcribe,
-  };
-};
enjoy/src/renderer/hooks/use-transcribe.tsx (new file)
@@ -0,0 +1,263 @@
+import {
+  AppSettingsProviderContext,
+  AISettingsProviderContext,
+} from "@renderer/context";
+import OpenAI from "openai";
+import { useContext } from "react";
+import { toast } from "@renderer/components/ui";
+import { t } from "i18next";
+import { fetchFile } from "@ffmpeg/util";
+import { AI_WORKER_ENDPOINT } from "@/constants";
+import * as sdk from "microsoft-cognitiveservices-speech-sdk";
+import axios from "axios";
+import take from "lodash/take";
+import sortedUniqBy from "lodash/sortedUniqBy";
+import { groupTranscription, END_OF_WORD_REGEX, milisecondsToTimestamp } from "@/utils";
+
+export const useTranscribe = () => {
+  const { EnjoyApp, ffmpeg, user, webApi } = useContext(
+    AppSettingsProviderContext
+  );
+  const { whisperConfig, openai } = useContext(AISettingsProviderContext);
+
+  const transcode = async (src: string, options?: string[]) => {
+    if (!ffmpeg?.loaded) return;
+
+    options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
+
+    try {
+      const uri = new URL(src);
+      const input = uri.pathname.split("/").pop();
+      const output = input.replace(/\.[^/.]+$/, ".wav");
+      await ffmpeg.writeFile(input, await fetchFile(src));
+      await ffmpeg.exec(["-i", input, ...options, output]);
+      const data = await ffmpeg.readFile(output);
+      return new Blob([data], { type: "audio/wav" });
+    } catch (e) {
+      toast.error(t("transcodeError"));
+    }
+  };
+
+  const transcribe = async (
+    mediaSrc: string
+  ): Promise<{
+    engine: string;
+    model: string;
+    result: TranscriptionResultSegmentGroupType[];
+  }> => {
+    const blob = await transcode(mediaSrc);
+
+    if (whisperConfig.service === "local") {
+      return transcribeByLocal(blob);
+    } else if (whisperConfig.service === "cloudflare") {
+      return transcribeByCloudflareAi(blob);
+    } else if (whisperConfig.service === "openai") {
+      return transcribeByOpenAi(blob);
+    } else if (whisperConfig.service === "azure") {
+      return transcribeByAzureAi(blob);
+    } else {
+      throw new Error(t("whisperServiceNotSupported"));
+    }
+  };
+
+  const transcribeByLocal = async (blob: Blob) => {
+    const res = await EnjoyApp.whisper.transcribe(
+      {
+        blob: {
+          type: blob.type.split(";")[0],
+          arrayBuffer: await blob.arrayBuffer(),
+        },
+      },
+      {
+        force: true,
+        extra: ["--prompt", `"Hello! Welcome to listen to this audio."`],
+      }
+    );
+
+    const result = groupTranscription(res.transcription);
+
+    return {
+      engine: "whisper",
+      model: res.model.type,
+      result,
+    };
+  };
+
+  const transcribeByOpenAi = async (blob: Blob) => {
+    if (!openai?.key) {
+      throw new Error(t("openaiKeyRequired"));
+    }
+
+    const client = new OpenAI({
+      apiKey: openai.key,
+      baseURL: openai.baseUrl,
+      dangerouslyAllowBrowser: true,
+    });
+
+    const res: {
+      words: {
+        word: string;
+        start: number;
+        end: number;
+      }[];
+    } = (await client.audio.transcriptions.create({
+      file: new File([blob], "audio.wav"),
+      model: "whisper-1",
+      response_format: "verbose_json",
+      timestamp_granularities: ["word"],
+    })) as any;
+
+    const transcription: TranscriptionResultSegmentType[] = res.words.map(
+      (word) => {
+        return {
+          offsets: {
+            from: word.start * 1000,
+            to: word.end * 1000,
+          },
+          timestamps: {
+            from: milisecondsToTimestamp(word.start * 1000),
+            to: milisecondsToTimestamp(word.end * 1000),
+          },
+          text: word.word,
+        };
+      }
+    );
+
+    const result = groupTranscription(transcription);
+
+    return {
+      engine: "openai",
+      model: "whisper-1",
+      result,
+    };
+  };
+
+  const transcribeByCloudflareAi = async (blob: Blob) => {
+    const res: CfWhipserOutputType = (
+      await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
+        headers: {
+          Authorization: `Bearer ${user.accessToken}`,
+        },
+        timeout: 1000 * 60 * 5,
+      })
+    ).data;
+    const transcription: TranscriptionResultSegmentType[] = res.words.map(
+      (word) => {
+        return {
+          offsets: {
+            from: word.start * 1000,
+            to: word.end * 1000,
+          },
+          timestamps: {
+            from: milisecondsToTimestamp(word.start * 1000),
+            to: milisecondsToTimestamp(word.end * 1000),
+          },
+          text: word.word,
+        };
+      }
+    );
+
+    const result = groupTranscription(transcription);
+
+    return {
+      engine: "cloudflare",
+      model: "@cf/openai/whisper",
+      result,
+    };
+  };
+
+  const transcribeByAzureAi = async (
+    blob: Blob
+  ): Promise<{
+    engine: string;
+    model: string;
+    result: TranscriptionResultSegmentGroupType[];
+  }> => {
+    const { token, region } = await webApi.generateSpeechToken();
+    const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(
+      new File([blob], "audio.wav")
+    );
+    // setting the recognition language to English.
+    config.speechRecognitionLanguage = "en-US";
+    config.requestWordLevelTimestamps();
+    config.outputFormat = sdk.OutputFormat.Detailed;
+
+    // create the speech recognizer.
+    const reco = new sdk.SpeechRecognizer(config, audioConfig);
+
+    let results: SpeechRecognitionResultType[] = [];
+
+    return new Promise((resolve, reject) => {
+      reco.recognizing = (_s, e) => {
+        console.log(e.result.text);
+      };
+
+      reco.recognized = (_s, e) => {
+        const json = e.result.properties.getProperty(
+          sdk.PropertyId.SpeechServiceResponse_JsonResult
+        );
+        const result = JSON.parse(json);
+        results = results.concat(result);
+      };
+
+      reco.canceled = (_s, e) => {
+        if (e.reason === sdk.CancellationReason.Error) {
+          return reject(new Error(e.errorDetails));
+        }
+
+        reco.stopContinuousRecognitionAsync();
+      };
+
+      reco.sessionStopped = (_s, _e) => {
+        reco.stopContinuousRecognitionAsync();
+
+        const transcription: TranscriptionResultSegmentType[] = [];
+
+        results.forEach((result) => {
+          const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
+          const words = best.Display.trim().split(" ");
+
+          best.Words.map((word, index) => {
+            let text = word.Word;
+            if (words.length === best.Words.length) {
+              text = words[index];
+            }
+
+            if (
+              index === best.Words.length - 1 &&
+              !text.trim().match(END_OF_WORD_REGEX)
+            ) {
+              text = text + ".";
+            }
+
+            transcription.push({
+              offsets: {
+                from: word.Offset / 1e4,
+                to: (word.Offset + word.Duration) / 1e4,
+              },
+              timestamps: {
+                from: milisecondsToTimestamp(word.Offset / 1e4),
+                to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
+              },
+              text,
+            });
+          });
+        });
+
+        resolve({
+          engine: "azure",
+          model: "whisper",
+          result: groupTranscription(transcription),
+        });
+      };
+
+      reco.startContinuousRecognitionAsync();
+    });
+  };
+
+  return {
+    transcode,
+    transcribe,
+  };
+};
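Whichever backend runs, the hook normalizes its return value to one shape, which is what lets MediaTranscription persist the outcome without caring about the engine. A sketch of that contract (values illustrative):

    const { engine, model, result } = await transcribe(mediaUrl);
    // engine: "whisper" | "openai" | "cloudflare" | "azure"
    // model:  e.g. "whisper-1" or "@cf/openai/whisper"
    // result: TranscriptionResultSegmentGroupType[], sentence groups with offsets and timestamps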
enjoy/src/types/enjoy-app.d.ts (vendored)
@@ -213,10 +213,18 @@ type EnjoyAppType = {
     setService: (
       service: WhisperConfigType["service"]
     ) => Promise<WhisperConfigType>;
-    transcribeBlob: (
-      blob: { type: string; arrayBuffer: ArrayBuffer },
-      prompt?: string
-    ) => Promise<{ file: string; content: string }>;
+    transcribe: (
+      params: {
+        file?: string;
+        blob?: { type: string; arrayBuffer: ArrayBuffer };
+      },
+      options?: {
+        force?: boolean;
+        extra?: string[];
+      }
+    ) => Promise<Partial<WhisperOutputType>>;
+    onProgress: (callback: (event, progress: number) => void) => void;
+    removeProgressListeners: () => Promise<void>;
   };
   ffmpeg: {
     config: () => Promise<FfmpegConfigType>;
@@ -245,10 +253,7 @@ type EnjoyAppType = {
   };
   transcriptions: {
     findOrCreate: (params: any) => Promise<TranscriptionType>;
-    process: (params: any, options: any) => Promise<void>;
     update: (id: string, params: any) => Promise<void>;
-    onProgress: (callback: (event, progress: number) => void) => void;
-    removeProgressListeners: () => Promise<void>;
   };
   waveforms: {
     find: (id: string) => Promise<WaveFormDataType>;
enjoy/src/types/index.d.ts (vendored)
@@ -27,7 +27,7 @@ type NotificationType = {
 };
 
 type WhisperConfigType = {
-  service: "local" | "azure" | "cloudflare";
+  service: "local" | "azure" | "cloudflare" | "openai";
   availableModels: {
     type: string;
     name: string;
@@ -1,43 +1,5 @@
-import { createHash } from "crypto";
-import { createReadStream } from "fs";
 import Pitchfinder from "pitchfinder";
 
-export function hashFile(
-  path: string,
-  options: { algo: string }
-): Promise<string> {
-  const algo = options.algo || "md5";
-  return new Promise((resolve, reject) => {
-    const hash = createHash(algo);
-    const stream = createReadStream(path);
-    stream.on("error", reject);
-    stream.on("data", (chunk) => hash.update(chunk));
-    stream.on("end", () => resolve(hash.digest("hex")));
-  });
-}
-
-export function hashBlob(
-  blob: Blob,
-  options: { algo: string }
-): Promise<string> {
-  const algo = options.algo || "md5";
-  return new Promise((resolve, reject) => {
-    const hash = createHash(algo);
-    const reader = new FileReader();
-    reader.onload = () => {
-      if (reader.result instanceof ArrayBuffer) {
-        const buffer = Buffer.from(reader.result);
-        hash.update(buffer);
-        resolve(hash.digest("hex"));
-      } else {
-        reject(new Error("Unexpected result from FileReader"));
-      }
-    };
-    reader.onerror = reject;
-    reader.readAsArrayBuffer(blob);
-  });
-}
-
 export function generatePitch(peaks: Float32Array, sampleRate: number) {
   const detectPitch = Pitchfinder.YIN({ sampleRate });
   const duration = peaks.length / sampleRate;
@@ -77,3 +39,56 @@ export function milisecondsToTimestamp(ms: number) {
     "0"
   )}:${seconds.padStart(2, "0")},${milliseconds}`;
 }
 
+export const MAGIC_TOKENS = ["Mrs.", "Ms.", "Mr.", "Dr.", "Prof.", "St."];
+export const END_OF_WORD_REGEX = /[^\.!,\?][\.!\?]/g;
+export const groupTranscription = (
+  transcription: TranscriptionResultSegmentType[]
+): TranscriptionResultSegmentGroupType[] => {
+  const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
+    if (!group || group.length === 0) return;
+
+    const firstWord = group[0];
+    const lastWord = group[group.length - 1];
+
+    return {
+      offsets: {
+        from: firstWord.offsets.from,
+        to: lastWord.offsets.to,
+      },
+      text: group.map((w) => w.text.trim()).join(" "),
+      timestamps: {
+        from: firstWord.timestamps.from,
+        to: lastWord.timestamps.to,
+      },
+      segments: group,
+    };
+  };
+
+  const groups: TranscriptionResultSegmentGroupType[] = [];
+  let group: TranscriptionResultSegmentType[] = [];
+
+  transcription.forEach((segment) => {
+    const text = segment.text.trim();
+    if (!text) return;
+
+    group.push(segment);
+
+    if (
+      !MAGIC_TOKENS.includes(text) &&
+      segment.text.trim().match(END_OF_WORD_REGEX)
+    ) {
+      // Group a complete sentence;
+      groups.push(generateGroup(group));
+
+      // init a new group
+      group = [];
+    }
+  });
+
+  // Group the last group
+  const lastSentence = generateGroup(group);
+  if (lastSentence) groups.push(lastSentence);
+
+  return groups;
+};
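A worked example of the grouping logic now shared via @/utils: word segments accumulate until one matches END_OF_WORD_REGEX and is not a MAGIC_TOKEN, so "Dr." never ends a sentence. A minimal sketch with illustrative values:

    const words: TranscriptionResultSegmentType[] = [
      {
        text: "Hello",
        offsets: { from: 0, to: 400 },
        timestamps: { from: "00:00:00,000", to: "00:00:00,400" },
      },
      {
        text: "world.",
        offsets: { from: 400, to: 900 },
        timestamps: { from: "00:00:00,400", to: "00:00:00,900" },
      },
    ];

    const groups = groupTranscription(words);
    // => [{ text: "Hello world.", offsets: { from: 0, to: 900 }, segments: [/* the two words */] }]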