Feat: refactor STT service (#294)
* Add STT hook interface
* Fix crypto being exported to the browser
* Refactor use-transcribe
* Allow using OpenAI for STT
* Remove deprecated code
* Fix undefined method
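The gist of the refactor: speech-to-text moves out of the Electron main process and into the renderer. A new useTranscribe hook transcodes media with ffmpeg.wasm, runs whichever STT service is configured (local whisper.cpp, Azure, Cloudflare, or the newly added OpenAI), and persists the result over IPC. A condensed sketch of the new flow, abbreviated from the MediaTranscription changes below (the surrounding component wiring is assumed):

    const { transcribe } = useTranscribe();

    const generate = async () => {
      // transcode + run whichever STT service whisperConfig selects
      const { engine, model, result } = await transcribe(mediaUrl);
      // persist the outcome; the main process no longer runs STT itself
      await EnjoyApp.transcriptions.update(transcription.id, {
        state: "finished",
        result,
        engine,
        model,
      });
    };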
@@ -317,6 +317,7 @@
   "azureSpeechToTextDescription": "Use Azure AI Speech to transcribe. It is a paid service.",
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe. It is in beta and free for now.",
+  "openaiSpeechToTextDescription": "Use openAI to transcribe using your own key.",
   "checkingWhisper": "Checking whisper status",
   "pleaseDownloadWhisperModelFirst": "Please download whisper model first",
   "whisperIsWorkingGood": "Whisper is working good",
@@ -316,6 +316,7 @@
   "azureSpeechToTextDescription": "使用 Azure AI Speech 进行语音转文本,收费服务",
   "cloudflareAi": "Cloudflare AI",
   "cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本,目前免费",
+  "openaiSpeechToTextDescription": "使用 OpenAI 进行语音转文本(需要 API 密钥)",
   "checkingWhisper": "正在检查 Whisper",
   "pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
   "whisperIsWorkingGood": "Whisper 正常工作",
@@ -3,7 +3,7 @@ import { Speech } from "@main/db/models";
 import fs from "fs-extra";
 import path from "path";
 import settings from "@main/settings";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 
 class SpeechesHandler {
   private async create(
@@ -1,7 +1,6 @@
 import { ipcMain, IpcMainEvent } from "electron";
 import { Transcription, Audio, Video } from "@main/db/models";
-import { WhereOptions, Attributes } from "sequelize";
-import { t } from "i18next";
+import { Attributes } from "sequelize";
 import log from "electron-log/main";
 
 const logger = log.scope("db/handlers/transcriptions-handler");
@@ -44,7 +43,7 @@ class TranscriptionsHandler {
     id: string,
     params: Attributes<Transcription>
   ) {
-    const { result } = params;
+    const { result, engine, model, state } = params;
 
     return Transcription.findOne({
       where: { id },
@@ -53,63 +52,7 @@ class TranscriptionsHandler {
         if (!transcription) {
           throw new Error("models.transcription.notFound");
         }
-        transcription.update({ result });
+        transcription.update({ result, engine, model, state });
-      })
-      .catch((err) => {
-        logger.error(err);
-        event.sender.send("on-notification", {
-          type: "error",
-          message: err.message,
-        });
-      });
-  }
-
-  private async process(
-    event: IpcMainEvent,
-    where: WhereOptions<Attributes<Transcription>>,
-    options?: {
-      force?: boolean;
-      blob: {
-        type: string;
-        arrayBuffer: ArrayBuffer;
-      };
-    }
-  ) {
-    const { force = true, blob } = options || {};
-    return Transcription.findOne({
-      where: {
-        ...where,
-      },
-    })
-      .then((transcription) => {
-        if (!transcription) {
-          throw new Error("models.transcription.notFound");
-        }
-
-        const interval = setInterval(() => {
-          event.sender.send("on-notification", {
-            type: "warning",
-            message: t("stillTranscribing"),
-          });
-        }, 1000 * 10);
-
-        transcription
-          .process({
-            force,
-            wavFileBlob: blob,
-            onProgress: (progress: number) => {
-              event.sender.send("transcription-on-progress", progress);
-            },
-          })
-          .catch((err) => {
-            event.sender.send("on-notification", {
-              type: "error",
-              message: err.message,
-            });
-          })
-          .finally(() => {
-            clearInterval(interval);
-          });
       })
       .catch((err) => {
         logger.error(err);
@@ -122,7 +65,6 @@ class TranscriptionsHandler {
 
   register() {
     ipcMain.handle("transcriptions-find-or-create", this.findOrCreate);
-    ipcMain.handle("transcriptions-process", this.process);
     ipcMain.handle("transcriptions-update", this.update);
   }
 }
@@ -17,7 +17,7 @@ import {
 import { Recording, Speech, Transcription, Video } from "@main/db/models";
 import settings from "@main/settings";
 import { AudioFormats, VideoFormats, WEB_API_URL } from "@/constants";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import path from "path";
 import fs from "fs-extra";
 import { t } from "i18next";
@@ -191,15 +191,6 @@ export class Audio extends Model<Audio> {
     }
   }
 
-  @AfterCreate
-  static transcribeAsync(audio: Audio) {
-    if (settings.ffmpegConfig().ready) {
-      setTimeout(() => {
-        audio.transcribe();
-      }, 500);
-    }
-  }
-
   @AfterCreate
   static autoSync(audio: Audio) {
     // auto sync should not block the main thread
@@ -332,38 +323,6 @@ export class Audio extends Model<Audio> {
     });
   }
 
-  // STT using whisper
-  async transcribe() {
-    Transcription.findOrCreate({
-      where: {
-        targetId: this.id,
-        targetType: "Audio",
-      },
-      defaults: {
-        targetId: this.id,
-        targetType: "Audio",
-        targetMd5: this.md5,
-      },
-    })
-      .then(([transcription, _created]) => {
-        if (transcription.state === "pending") {
-          transcription.process();
-        } else if (transcription.state === "finished") {
-          transcription.process({ force: true });
-        } else if (transcription.state === "processing") {
-          logger.warn(
-            `[${transcription.getDataValue("id")}]`,
-            "Transcription is processing."
-          );
-        }
-      })
-      .catch((err) => {
-        logger.error(err);
-
-        throw err;
-      });
-  }
-
   static notify(audio: Audio, action: "create" | "update" | "destroy") {
     if (!mainWindow.win) return;
 
@@ -29,7 +29,7 @@ import fs from "fs-extra";
 import path from "path";
 import Ffmpeg from "@main/ffmpeg";
 import whisper from "@main/whisper";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import { WEB_API_URL } from "@/constants";
 import proxyAgent from "@main/proxy-agent";
 
@@ -19,7 +19,7 @@ import { Audio, PronunciationAssessment, Video } from "@main/db/models";
 import fs from "fs-extra";
 import path from "path";
 import settings from "@main/settings";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import log from "electron-log/main";
 import storage from "@main/storage";
 import { Client } from "@/api";
@@ -20,7 +20,7 @@ import path from "path";
 import settings from "@main/settings";
 import OpenAI, { type ClientOptions } from "openai";
 import { t } from "i18next";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import { Audio, Message } from "@main/db/models";
 import log from "electron-log/main";
 import { WEB_API_URL } from "@/constants";
@@ -1,5 +1,4 @@
 import {
-  AfterCreate,
   AfterUpdate,
   AfterDestroy,
   AfterFind,
@@ -13,18 +12,13 @@ import {
   Unique,
 } from "sequelize-typescript";
 import { Audio, Video } from "@main/db/models";
-import whisper from "@main/whisper";
 import mainWindow from "@main/window";
 import log from "electron-log/main";
 import { Client } from "@/api";
 import { WEB_API_URL, PROCESS_TIMEOUT } from "@/constants";
 import settings from "@main/settings";
-import Ffmpeg from "@main/ffmpeg";
-import path from "path";
-import fs from "fs-extra";
 
 const logger = log.scope("db/models/transcription");
 
 @Table({
   modelName: "Transcription",
   tableName: "transcriptions",
@@ -80,120 +74,13 @@ export class Transcription extends Model<Transcription> {
     const webApi = new Client({
       baseUrl: process.env.WEB_API_URL || WEB_API_URL,
       accessToken: settings.getSync("user.accessToken") as string,
-      logger: log.scope("api/client"),
+      logger,
     });
     return webApi.syncTranscription(this.toJSON()).then(() => {
       this.update({ syncedAt: new Date() });
     });
   }
 
-  // STT using whisper
-  async process(
-    options: {
-      force?: boolean;
-      wavFileBlob?: { type: string; arrayBuffer: ArrayBuffer };
-      onProgress?: (progress: number) => void;
-    } = {}
-  ) {
-    if (this.getDataValue("state") === "processing") return;
-
-    const { force = false, wavFileBlob, onProgress } = options;
-
-    logger.info(`[${this.getDataValue("id")}]`, "Start to transcribe.");
-
-    let filePath = "";
-    if (this.targetType === "Audio") {
-      filePath = (await Audio.findByPk(this.targetId)).filePath;
-    } else if (this.targetType === "Video") {
-      filePath = (await Video.findByPk(this.targetId)).filePath;
-    }
-
-    if (!filePath) {
-      logger.error(`[${this.getDataValue("id")}]`, "No file path.");
-      throw new Error("No file path.");
-    }
-
-    let wavFile: string = filePath;
-
-    const tmpDir = settings.cachePath();
-    const outputFile = path.join(
-      tmpDir,
-      path.basename(filePath, path.extname(filePath)) + ".wav"
-    );
-
-    if (wavFileBlob) {
-      const format = wavFileBlob.type.split("/")[1];
-
-      if (format !== "wav") {
-        throw new Error("Only wav format is supported");
-      }
-
-      await fs.outputFile(outputFile, Buffer.from(wavFileBlob.arrayBuffer));
-      wavFile = outputFile;
-    } else if (settings.ffmpegConfig().ready) {
-      const ffmpeg = new Ffmpeg();
-      try {
-        wavFile = await ffmpeg.prepareForWhisper(
-          filePath,
-          path.join(
-            tmpDir,
-            path.basename(filePath, path.extname(filePath)) + ".wav"
-          )
-        );
-      } catch (err) {
-        logger.error("ffmpeg error", err);
-      }
-    }
-
-    try {
-      await this.update({
-        state: "processing",
-      });
-      const {
-        engine = "whisper",
-        model,
-        transcription,
-      } = await whisper.transcribe(wavFile, {
-        force,
-        extra: [
-          "--split-on-word",
-          "--max-len",
-          "1",
-          "--prompt",
-          `"Hello! Welcome to listen to this audio."`,
-        ],
-        onProgress,
-      });
-      const result = whisper.groupTranscription(transcription);
-      this.update({
-        engine,
-        model: model?.type,
-        result,
-        state: "finished",
-      }).then(() => this.sync());
-
-      logger.info(`[${this.getDataValue("id")}]`, "Transcription finished.");
-    } catch (err) {
-      logger.error(
-        `[${this.getDataValue("id")}]`,
-        "Transcription not finished.",
-        err
-      );
-      this.update({
-        state: "pending",
-      });
-
-      throw err;
-    }
-  }
-
-  @AfterCreate
-  static startTranscribeAsync(transcription: Transcription) {
-    setTimeout(() => {
-      transcription.process();
-    }, 0);
-  }
-
   @AfterUpdate
   static notifyForUpdate(transcription: Transcription) {
     this.notify(transcription, "update");
@@ -17,7 +17,7 @@ import {
 import { Audio, Recording, Speech, Transcription } from "@main/db/models";
 import settings from "@main/settings";
 import { AudioFormats, VideoFormats, WEB_API_URL } from "@/constants";
-import { hashFile } from "@/utils";
+import { hashFile } from "@main/utils";
 import path from "path";
 import fs from "fs-extra";
 import { t } from "i18next";
@@ -213,15 +213,6 @@ export class Video extends Model<Video> {
     }
   }
 
-  @AfterCreate
-  static transcribeAsync(video: Video) {
-    if (settings.ffmpegConfig().ready) {
-      setTimeout(() => {
-        video.transcribe();
-      }, 500);
-    }
-  }
-
   @AfterCreate
   static autoSync(video: Video) {
     // auto sync should not block the main thread
@@ -355,37 +346,6 @@ export class Video extends Model<Video> {
     });
   }
 
-  async transcribe() {
-    Transcription.findOrCreate({
-      where: {
-        targetId: this.id,
-        targetType: "Video",
-      },
-      defaults: {
-        targetId: this.id,
-        targetType: "Video",
-        targetMd5: this.md5,
-      },
-    })
-      .then(([transcription, _created]) => {
-        if (transcription.state === "pending") {
-          transcription.process();
-        } else if (transcription.state === "finished") {
-          transcription.process({ force: true });
-        } else if (transcription.state === "processing") {
-          logger.warn(
-            `[${transcription.getDataValue("id")}]`,
-            "Transcription is processing."
-          );
-        }
-      })
-      .catch((err) => {
-        logger.error(err);
-
-        throw err;
-      });
-  }
-
   static notify(video: Video, action: "create" | "update" | "destroy") {
     if (!mainWindow.win) return;
 
enjoy/src/main/utils.ts (new file)
@@ -0,0 +1,38 @@
+import { createHash } from "crypto";
+import { createReadStream } from "fs";
+
+export function hashFile(
+  path: string,
+  options: { algo: string }
+): Promise<string> {
+  const algo = options.algo || "md5";
+  return new Promise((resolve, reject) => {
+    const hash = createHash(algo);
+    const stream = createReadStream(path);
+    stream.on("error", reject);
+    stream.on("data", (chunk) => hash.update(chunk));
+    stream.on("end", () => resolve(hash.digest("hex")));
+  });
+}
+
+export function hashBlob(
+  blob: Blob,
+  options: { algo: string }
+): Promise<string> {
+  const algo = options.algo || "md5";
+  return new Promise((resolve, reject) => {
+    const hash = createHash(algo);
+    const reader = new FileReader();
+    reader.onload = () => {
+      if (reader.result instanceof ArrayBuffer) {
+        const buffer = Buffer.from(reader.result);
+        hash.update(buffer);
+        resolve(hash.digest("hex"));
+      } else {
+        reject(new Error("Unexpected result from FileReader"));
+      }
+    };
+    reader.onerror = reject;
+    reader.readAsArrayBuffer(blob);
+  });
+}
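hashFile relies on Node's crypto and fs modules, which is why it now lives under @main instead of the shared renderer utils; that is what the repeated "@/utils" to "@main/utils" import changes in this commit are about. A minimal usage sketch (the file path is hypothetical):

    import { hashFile } from "@main/utils";

    // MD5 digest, as used for fields like Transcription.targetMd5
    const md5 = await hashFile("/path/to/audio.mp3", { algo: "md5" });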
@@ -152,162 +152,17 @@ class Whipser {
     });
   }
 
-  async transcribeBlob(
-    blob: { type: string; arrayBuffer: ArrayBuffer },
-    options?: {
-      prompt?: string;
-      group?: boolean;
-    }
-  ): Promise<
-    TranscriptionResultSegmentType[] | TranscriptionResultSegmentGroupType[]
-  > {
-    const { prompt, group = false } = options || {};
-
-    const format = blob.type.split("/")[1];
-
-    if (format !== "wav") {
-      throw new Error("Only wav format is supported");
-    }
-
-    const tempfile = path.join(settings.cachePath(), `${Date.now()}.${format}`);
-    await fs.outputFile(tempfile, Buffer.from(blob.arrayBuffer));
-
-    const extra = [];
-    if (prompt) {
-      extra.push(`--prompt "${prompt.replace(/"/g, '\\"')}"`);
-    }
-    const { transcription } = await this.transcribe(tempfile, {
-      force: true,
-      extra,
-    });
-
-    if (group) {
-      return this.groupTranscription(transcription);
-    } else {
-      return transcription;
-    }
-  }
-
-  async transcribe(
-    file: string,
-    options?: {
-      force?: boolean;
-      extra?: string[];
-      onProgress?: (progress: number) => void;
-    }
-  ): Promise<Partial<WhisperOutputType>> {
-    if (this.config.service === "local") {
-      return this.transcribeFromLocal(file, options);
-    } else if (this.config.service === "azure") {
-      return this.transcribeFromAzure(file);
-    } else if (this.config.service === "cloudflare") {
-      return this.transcribeFromCloudflare(file);
-    } else {
-      throw new Error("Unknown service");
-    }
-  }
-
-  async transcribeFromAzure(file: string): Promise<Partial<WhisperOutputType>> {
-    const webApi = new Client({
-      baseUrl: process.env.WEB_API_URL || WEB_API_URL,
-      accessToken: settings.getSync("user.accessToken") as string,
-      logger: log.scope("api/client"),
-    });
-    const { token, region } = await webApi.generateSpeechToken();
-    const sdk = new AzureSpeechSdk(token, region);
-
-    const results = await sdk.transcribe({
-      filePath: file,
-    });
-
-    const transcription: TranscriptionResultSegmentType[] = [];
-    results.forEach((result) => {
-      logger.debug(result);
-      const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
-      const words = best.Display.trim().split(" ");
-
-      best.Words.map((word, index) => {
-        let text = word.Word;
-        if (words.length === best.Words.length) {
-          text = words[index];
-        }
-
-        if (
-          index === best.Words.length - 1 &&
-          !text.trim().match(END_OF_WORD_REGEX)
-        ) {
-          text = text + ".";
-        }
-
-        transcription.push({
-          offsets: {
-            from: word.Offset / 1e4,
-            to: (word.Offset + word.Duration) / 1e4,
-          },
-          timestamps: {
-            from: milisecondsToTimestamp(word.Offset / 1e4),
-            to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
-          },
-          text,
-        });
-      });
-    });
-
-    return {
-      engine: "azure",
-      model: {
-        type: "Azure AI Speech",
-      },
-      transcription,
-    };
-  }
-
-  async transcribeFromCloudflare(
-    file: string
-  ): Promise<Partial<WhisperOutputType>> {
-    logger.debug("transcribing from CloudFlare");
-
-    const data = fs.readFileSync(file);
-    const res: CfWhipserOutputType = (
-      await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, data, {
-        headers: {
-          Authorization: `Bearer ${settings.getSync("user.accessToken")}`,
-        },
-      })
-    ).data;
-    logger.debug("transcription from Web,", res);
-
-    const transcription: TranscriptionResultSegmentType[] = res.words.map(
-      (word) => {
-        return {
-          offsets: {
-            from: word.start * 1000,
-            to: word.end * 1000,
-          },
-          timestamps: {
-            from: milisecondsToTimestamp(word.start * 1000),
-            to: milisecondsToTimestamp(word.end * 1000),
-          },
-          text: word.word,
-        };
-      }
-    );
-    logger.debug("converted transcription,", transcription);
-
-    return {
-      engine: "cloudflare",
-      model: {
-        type: "@cf/openai/whisper",
-      },
-      transcription,
-    };
-  }
-
   /* Ensure the file is in wav format
    * and 16kHz sample rate
    */
-  async transcribeFromLocal(
-    file: string,
+  async transcribe(
+    params: {
+      file?: string;
+      blob?: {
+        type: string;
+        arrayBuffer: ArrayBuffer;
+      };
+    },
     options?: {
       force?: boolean;
       extra?: string[];
@@ -315,6 +170,28 @@ class Whipser {
     }
   ): Promise<Partial<WhisperOutputType>> {
     logger.debug("transcribing from local");
 
+    const { blob } = params;
+    let { file } = params;
+    if (!file && !blob) {
+      throw new Error("No file or blob provided");
+    }
+
+    if (!this.currentModel()) {
+      throw new Error(t("pleaseDownloadWhisperModelFirst"));
+    }
+
+    if (blob) {
+      const format = blob.type.split("/")[1];
+
+      if (format !== "wav") {
+        throw new Error("Only wav format is supported");
+      }
+
+      file = path.join(settings.cachePath(), `${Date.now()}.${format}`);
+      await fs.outputFile(file, Buffer.from(blob.arrayBuffer));
+    }
+
     const { force = false, extra = [], onProgress } = options || {};
     const filename = path.basename(file, path.extname(file));
     const tmpDir = settings.cachePath();
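The unified transcribe entry point now accepts either a file path or a wav blob; as the hunk above shows, a blob arriving over IPC is written to the cache directory before whisper.cpp runs. A sketch of the two call shapes (the path and the arrayBuffer variable are illustrative):

    // From a file already on disk:
    await whisper.transcribe({ file: "/tmp/audio.wav" }, { force: true });

    // From a wav blob received over IPC (persisted to settings.cachePath() internally):
    await whisper.transcribe(
      { blob: { type: "audio/wav", arrayBuffer } },
      { onProgress: (progress) => logger.debug(`${progress}%`) }
    );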
@@ -326,46 +203,35 @@ class Whipser {
       return fs.readJson(outputFile);
     }
 
-    if (!this.currentModel()) {
-      throw new Error(t("pleaseDownloadWhisperModelFirst"));
-    }
-
-    const command = [
-      `"${this.binMain}"`,
-      `--file "${file}"`,
-      `--model "${this.currentModel()}"`,
-      "--output-json",
-      `--output-file "${path.join(tmpDir, filename)}"`,
-      "-pp",
-      ...extra,
-    ].join(" ");
+    const commandArguments = [
+      "--file",
+      file,
+      "--model",
+      this.currentModel(),
+      "--output-json",
+      "--output-file",
+      path.join(tmpDir, filename),
+      "-pp",
+      "--split-on-word",
+      "--max-len",
+      "1",
+      ...extra,
+    ];
 
-    logger.info(`Running command: ${command}`);
-
-    const transcribe = spawn(
-      this.binMain,
-      [
-        "--file",
-        file,
-        "--model",
-        this.currentModel(),
-        "--output-json",
-        "--output-file",
-        path.join(tmpDir, filename),
-        "-pp",
-        ...extra,
-      ],
-      {
-        timeout: PROCESS_TIMEOUT,
-      }
-    );
+    logger.info(
+      `Running command: ${this.binMain} ${commandArguments.join(" ")}`
+    );
+
+    const command = spawn(this.binMain, commandArguments, {
+      timeout: PROCESS_TIMEOUT,
+    });
 
     return new Promise((resolve, reject) => {
-      transcribe.stdout.on("data", (data) => {
+      command.stdout.on("data", (data) => {
         logger.debug(`stdout: ${data}`);
       });
 
-      transcribe.stderr.on("data", (data) => {
+      command.stderr.on("data", (data) => {
         const output = data.toString();
         logger.error(`stderr: ${output}`);
         if (output.startsWith("whisper_print_progress_callback")) {
@@ -374,16 +240,16 @@ class Whipser {
         }
       });
 
-      transcribe.on("exit", (code) => {
+      command.on("exit", (code) => {
         logger.info(`transcribe process exited with code ${code}`);
       });
 
-      transcribe.on("error", (err) => {
+      command.on("error", (err) => {
         logger.error("transcribe error", err.message);
         reject(err);
       });
 
-      transcribe.on("close", () => {
+      command.on("close", () => {
         if (fs.pathExistsSync(outputFile)) {
           resolve(fs.readJson(outputFile));
         } else {
@@ -393,57 +259,6 @@ class Whipser {
     });
   }
 
-  groupTranscription(
-    transcription: TranscriptionResultSegmentType[]
-  ): TranscriptionResultSegmentGroupType[] {
-    const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
-      if (!group || group.length === 0) return;
-
-      const firstWord = group[0];
-      const lastWord = group[group.length - 1];
-
-      return {
-        offsets: {
-          from: firstWord.offsets.from,
-          to: lastWord.offsets.to,
-        },
-        text: group.map((w) => w.text.trim()).join(" "),
-        timestamps: {
-          from: firstWord.timestamps.from,
-          to: lastWord.timestamps.to,
-        },
-        segments: group,
-      };
-    };
-
-    const groups: TranscriptionResultSegmentGroupType[] = [];
-    let group: TranscriptionResultSegmentType[] = [];
-
-    transcription.forEach((segment) => {
-      const text = segment.text.trim();
-      if (!text) return;
-
-      group.push(segment);
-
-      if (
-        !MAGIC_TOKENS.includes(text) &&
-        segment.text.trim().match(END_OF_WORD_REGEX)
-      ) {
-        // Group a complete sentence;
-        groups.push(generateGroup(group));
-
-        // init a new group
-        group = [];
-      }
-    });
-
-    // Group the last group
-    const lastSentence = generateGroup(group);
-    if (lastSentence) groups.push(lastSentence);
-
-    return groups;
-  }
-
   registerIpcHandlers() {
     ipcMain.handle("whisper-config", async () => {
       try {
@@ -489,7 +304,7 @@ class Whipser {
           message: err.message,
         });
       }
-    } else if (["cloudflare", "azure"].includes(service)) {
+    } else if (["cloudflare", "azure", "openai"].includes(service)) {
       settings.setSync("whisper.service", service);
       this.config.service = service;
       return this.config;
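The service-switching handler shown here now accepts "openai" as well, so switching from the renderer is a one-liner (a sketch; the preload method is declared in enjoy-app.d.ts further down):

    // "openai" joins "local", "azure" and "cloudflare" as a valid service
    await EnjoyApp.whisper.setService("openai");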
@@ -505,9 +320,14 @@ class Whipser {
       return await this.check();
     });
 
-    ipcMain.handle("whisper-transcribe-blob", async (event, blob, prompt) => {
+    ipcMain.handle("whisper-transcribe", async (event, params, options) => {
       try {
-        return await this.transcribeBlob(blob, prompt);
+        return await this.transcribe(params, {
+          ...options,
+          onProgress: (progress) => {
+            event.sender.send("whisper-on-progress", progress);
+          },
+        });
       } catch (err) {
         event.sender.send("on-notification", {
           type: "error",
@@ -363,11 +363,26 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     check: () => {
       return ipcRenderer.invoke("whisper-check");
     },
-    transcribeBlob: (
-      blob: { type: string; arrayBuffer: ArrayBuffer },
-      prompt?: string
+    transcribe: (
+      params: {
+        file?: string;
+        blob?: {
+          type: string;
+          arrayBuffer: ArrayBuffer;
+        };
+      },
+      options?: {
+        force?: boolean;
+        extra?: string[];
+      }
     ) => {
-      return ipcRenderer.invoke("whisper-transcribe-blob", blob, prompt);
+      return ipcRenderer.invoke("whisper-transcribe", params, options);
+    },
+    onProgress: (
+      callback: (event: IpcRendererEvent, progress: number) => void
+    ) => ipcRenderer.on("whisper-on-progress", callback),
+    removeProgressListeners: () => {
+      ipcRenderer.removeAllListeners("whisper-on-progress");
     },
   },
   ffmpeg: {
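Progress reporting moves with the handler: the preload now exposes whisper.onProgress and removeProgressListeners on the "whisper-on-progress" channel instead of "transcription-on-progress". A sketch of how a renderer component subscribes, mirroring the MediaTranscription changes below (the setProgress state setter is assumed):

    useEffect(() => {
      EnjoyApp.whisper.onProgress((_event, progress: number) => {
        setProgress(Math.min(progress, 100)); // clamp, as the component does
      });
      return () => {
        EnjoyApp.whisper.removeProgressListeners();
      };
    }, []);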
@@ -425,18 +440,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
     findOrCreate: (params: any) => {
       return ipcRenderer.invoke("transcriptions-find-or-create", params);
     },
-    process: (params: any, options: any) => {
-      return ipcRenderer.invoke("transcriptions-process", params, options);
-    },
     update: (id: string, params: any) => {
       return ipcRenderer.invoke("transcriptions-update", id, params);
     },
-    onProgress: (
-      callback: (event: IpcRendererEvent, progress: number) => void
-    ) => ipcRenderer.on("transcription-on-progress", callback),
-    removeProgressListeners: () => {
-      ipcRenderer.removeAllListeners("transcription-on-progress");
-    },
   },
   waveforms: {
     find: (id: string) => {
@@ -12,6 +12,7 @@ import {
   ScrollArea,
   Button,
   PingPoint,
+  toast,
 } from "@renderer/components/ui";
 import React, { useEffect, useContext, useState } from "react";
 import { t } from "i18next";
@@ -19,6 +20,7 @@ import { LoaderIcon, CheckCircleIcon, MicIcon } from "lucide-react";
 import {
   DbProviderContext,
   AppSettingsProviderContext,
+  AISettingsProviderContext,
 } from "@renderer/context";
 import { useTranscribe } from "@renderer/hooks";
 
@@ -32,6 +34,7 @@ export const MediaTranscription = (props: {
   onSelectSegment?: (index: number) => void;
 }) => {
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
+  const { whisperConfig } = useContext(AISettingsProviderContext);
   const { EnjoyApp } = useContext(AppSettingsProviderContext);
   const {
     transcription,
@@ -55,13 +58,19 @@ export const MediaTranscription = (props: {
 
     setTranscribing(true);
     setProgress(0);
-    transcribe({
-      mediaId,
-      mediaType,
-      mediaSrc: mediaUrl,
-    }).finally(() => {
-      setTranscribing(false);
-    });
+    try {
+      const { engine, model, result } = await transcribe(mediaUrl);
+      await EnjoyApp.transcriptions.update(transcription.id, {
+        state: "finished",
+        result,
+        engine,
+        model,
+      });
+    } catch (err) {
+      toast.error(err.message);
+    }
+
+    setTranscribing(false);
   };
 
   const fetchSegmentStats = async () => {
@@ -80,14 +89,16 @@ export const MediaTranscription = (props: {
       generate();
     }
 
-    EnjoyApp.transcriptions.onProgress((_, p: number) => {
-      if (p > 100) p = 100;
-      setProgress(p);
-    });
+    if (whisperConfig.service === "local") {
+      EnjoyApp.whisper.onProgress((_, p: number) => {
+        if (p > 100) p = 100;
+        setProgress(p);
+      });
+    }
 
     return () => {
       removeDbListener(fetchSegmentStats);
-      EnjoyApp.transcriptions.removeProgressListeners();
+      EnjoyApp.whisper.removeProgressListeners();
     };
   }, [mediaId, mediaType]);
 
@@ -114,7 +125,9 @@ export const MediaTranscription = (props: {
         {transcribing || transcription.state === "processing" ? (
           <>
             <PingPoint colorClassName="bg-yellow-500" />
-            <div className="text-sm">{progress}%</div>
+            <div className="text-sm">
+              {whisperConfig.service === "local" && `${progress}%`}
+            </div>
           </>
         ) : transcription.state === "finished" ? (
           <CheckCircleIcon className="text-green-500 w-4 h-4" />
@@ -77,6 +77,8 @@ export const WhisperSettings = () => {
             t("azureSpeechToTextDescription")}
           {whisperConfig?.service === "cloudflare" &&
             t("cloudflareSpeechToTextDescription")}
+          {whisperConfig?.service === "openai" &&
+            t("openaiSpeechToTextDescription")}
         </div>
       </div>
 
@@ -94,6 +96,7 @@ export const WhisperSettings = () => {
             <SelectItem value="local">{t("local")}</SelectItem>
             <SelectItem value="azure">{t("azureAi")}</SelectItem>
             <SelectItem value="cloudflare">{t("cloudflareAi")}</SelectItem>
+            <SelectItem value="openai">OpenAI</SelectItem>
           </SelectContent>
         </Select>
 
@@ -1,3 +1,3 @@
-export * from './use-transcode';
+export * from './use-transcribe';
 export * from './use-ai-command';
 export * from './use-conversation';
@@ -1,58 +0,0 @@
-import { AppSettingsProviderContext } from "@renderer/context";
-import { useContext } from "react";
-import { toast } from "@renderer/components/ui";
-import { t } from "i18next";
-import { fetchFile } from "@ffmpeg/util";
-
-export const useTranscribe = () => {
-  const { EnjoyApp, ffmpeg } = useContext(AppSettingsProviderContext);
-
-  const transcode = async (src: string, options?: string[]) => {
-    if (!ffmpeg?.loaded) return;
-
-    options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
-
-    try {
-      const uri = new URL(src);
-      const input = uri.pathname.split("/").pop();
-      const output = input.replace(/\.[^/.]+$/, ".wav");
-      await ffmpeg.writeFile(input, await fetchFile(src));
-      await ffmpeg.exec(["-i", input, ...options, output]);
-      const data = await ffmpeg.readFile(output);
-      return new Blob([data], { type: "audio/wav" });
-    } catch (e) {
-      toast.error(t("transcodeError"));
-    }
-  };
-
-  const transcribe = async (params: {
-    mediaSrc: string;
-    mediaId: string;
-    mediaType: "Audio" | "Video";
-  }) => {
-    const { mediaSrc, mediaId, mediaType } = params;
-    const data = await transcode(mediaSrc);
-    let blob;
-    if (data) {
-      blob = {
-        type: data.type.split(";")[0],
-        arrayBuffer: await data.arrayBuffer(),
-      };
-    }
-
-    return EnjoyApp.transcriptions.process(
-      {
-        targetId: mediaId,
-        targetType: mediaType,
-      },
-      {
-        blob,
-      }
-    );
-  };
-
-  return {
-    transcode,
-    transcribe,
-  };
-};
enjoy/src/renderer/hooks/use-transcribe.tsx (new file)
@@ -0,0 +1,263 @@
+import {
+  AppSettingsProviderContext,
+  AISettingsProviderContext,
+} from "@renderer/context";
+import OpenAI from "openai";
+import { useContext } from "react";
+import { toast } from "@renderer/components/ui";
+import { t } from "i18next";
+import { fetchFile } from "@ffmpeg/util";
+import { AI_WORKER_ENDPOINT } from "@/constants";
+import * as sdk from "microsoft-cognitiveservices-speech-sdk";
+import axios from "axios";
+import take from "lodash/take";
+import sortedUniqBy from "lodash/sortedUniqBy";
+import { groupTranscription, END_OF_WORD_REGEX, milisecondsToTimestamp } from "@/utils";
+
+export const useTranscribe = () => {
+  const { EnjoyApp, ffmpeg, user, webApi } = useContext(
+    AppSettingsProviderContext
+  );
+  const { whisperConfig, openai } = useContext(AISettingsProviderContext);
+
+  const transcode = async (src: string, options?: string[]) => {
+    if (!ffmpeg?.loaded) return;
+
+    options = options || ["-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le"];
+
+    try {
+      const uri = new URL(src);
+      const input = uri.pathname.split("/").pop();
+      const output = input.replace(/\.[^/.]+$/, ".wav");
+      await ffmpeg.writeFile(input, await fetchFile(src));
+      await ffmpeg.exec(["-i", input, ...options, output]);
+      const data = await ffmpeg.readFile(output);
+      return new Blob([data], { type: "audio/wav" });
+    } catch (e) {
+      toast.error(t("transcodeError"));
+    }
+  };
+
+  const transcribe = async (
+    mediaSrc: string
+  ): Promise<{
+    engine: string;
+    model: string;
+    result: TranscriptionResultSegmentGroupType[];
+  }> => {
+    const blob = await transcode(mediaSrc);
+
+    if (whisperConfig.service === "local") {
+      return transcribeByLocal(blob);
+    } else if (whisperConfig.service === "cloudflare") {
+      return transcribeByCloudflareAi(blob);
+    } else if (whisperConfig.service === "openai") {
+      return transcribeByOpenAi(blob);
+    } else if (whisperConfig.service === "azure") {
+      return transcribeByAzureAi(blob);
+    } else {
+      throw new Error(t("whisperServiceNotSupported"));
+    }
+  };
+
+  const transcribeByLocal = async (blob: Blob) => {
+    const res = await EnjoyApp.whisper.transcribe(
+      {
+        blob: {
+          type: blob.type.split(";")[0],
+          arrayBuffer: await blob.arrayBuffer(),
+        },
+      },
+      {
+        force: true,
+        extra: ["--prompt", `"Hello! Welcome to listen to this audio."`],
+      }
+    );
+
+    const result = groupTranscription(res.transcription);
+
+    return {
+      engine: "whisper",
+      model: res.model.type,
+      result,
+    };
+  };
+
+  const transcribeByOpenAi = async (blob: Blob) => {
+    if (!openai?.key) {
+      throw new Error(t("openaiKeyRequired"));
+    }
+
+    const client = new OpenAI({
+      apiKey: openai.key,
+      baseURL: openai.baseUrl,
+      dangerouslyAllowBrowser: true,
+    });
+
+    const res: {
+      words: {
+        word: string;
+        start: number;
+        end: number;
+      }[];
+    } = (await client.audio.transcriptions.create({
+      file: new File([blob], "audio.wav"),
+      model: "whisper-1",
+      response_format: "verbose_json",
+      timestamp_granularities: ["word"],
+    })) as any;
+
+    const transcription: TranscriptionResultSegmentType[] = res.words.map(
+      (word) => {
+        return {
+          offsets: {
+            from: word.start * 1000,
+            to: word.end * 1000,
+          },
+          timestamps: {
+            from: milisecondsToTimestamp(word.start * 1000),
+            to: milisecondsToTimestamp(word.end * 1000),
+          },
+          text: word.word,
+        };
+      }
+    );
+
+    const result = groupTranscription(transcription);
+
+    return {
+      engine: "openai",
+      model: "whisper-1",
+      result,
+    };
+  };
+
+  const transcribeByCloudflareAi = async (blob: Blob) => {
+    const res: CfWhipserOutputType = (
+      await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, blob, {
+        headers: {
+          Authorization: `Bearer ${user.accessToken}`,
+        },
+        timeout: 1000 * 60 * 5,
+      })
+    ).data;
+    const transcription: TranscriptionResultSegmentType[] = res.words.map(
+      (word) => {
+        return {
+          offsets: {
+            from: word.start * 1000,
+            to: word.end * 1000,
+          },
+          timestamps: {
+            from: milisecondsToTimestamp(word.start * 1000),
+            to: milisecondsToTimestamp(word.end * 1000),
+          },
+          text: word.word,
+        };
+      }
+    );
+
+    const result = groupTranscription(transcription);
+
+    return {
+      engine: "cloudflare",
+      model: "@cf/openai/whisper",
+      result,
+    };
+  };
+
+  const transcribeByAzureAi = async (
+    blob: Blob
+  ): Promise<{
+    engine: string;
+    model: string;
+    result: TranscriptionResultSegmentGroupType[];
+  }> => {
+    const { token, region } = await webApi.generateSpeechToken();
+    const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(
+      new File([blob], "audio.wav")
+    );
+    // setting the recognition language to English.
+    config.speechRecognitionLanguage = "en-US";
+    config.requestWordLevelTimestamps();
+    config.outputFormat = sdk.OutputFormat.Detailed;
+
+    // create the speech recognizer.
+    const reco = new sdk.SpeechRecognizer(config, audioConfig);
+
+    let results: SpeechRecognitionResultType[] = [];
+
+    return new Promise((resolve, reject) => {
+      reco.recognizing = (_s, e) => {
+        console.log(e.result.text);
+      };
+
+      reco.recognized = (_s, e) => {
+        const json = e.result.properties.getProperty(
+          sdk.PropertyId.SpeechServiceResponse_JsonResult
+        );
+        const result = JSON.parse(json);
+        results = results.concat(result);
+      };
+
+      reco.canceled = (_s, e) => {
+        if (e.reason === sdk.CancellationReason.Error) {
+          return reject(new Error(e.errorDetails));
+        }
+
+        reco.stopContinuousRecognitionAsync();
+      };
+
+      reco.sessionStopped = (_s, _e) => {
+        reco.stopContinuousRecognitionAsync();
+
+        const transcription: TranscriptionResultSegmentType[] = [];
+
+        results.forEach((result) => {
+          const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
+          const words = best.Display.trim().split(" ");
+
+          best.Words.map((word, index) => {
+            let text = word.Word;
+            if (words.length === best.Words.length) {
+              text = words[index];
+            }
+
+            if (
+              index === best.Words.length - 1 &&
+              !text.trim().match(END_OF_WORD_REGEX)
+            ) {
+              text = text + ".";
+            }
+
+            transcription.push({
+              offsets: {
+                from: word.Offset / 1e4,
+                to: (word.Offset + word.Duration) / 1e4,
+              },
+              timestamps: {
+                from: milisecondsToTimestamp(word.Offset / 1e4),
+                to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
+              },
+              text,
+            });
+          });
+        });
+
+        resolve({
+          engine: "azure",
+          model: "whisper",
+          result: groupTranscription(transcription),
+        });
+      };
+
+      reco.startContinuousRecognitionAsync();
+    });
+  };
+
+  return {
+    transcode,
+    transcribe,
+  };
+};
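Whichever backend runs, the hook normalizes its return value to one shape, which is what lets MediaTranscription persist the outcome without caring about the engine. A sketch of that contract (values illustrative):

    const { engine, model, result } = await transcribe(mediaUrl);
    // engine: "whisper" | "openai" | "cloudflare" | "azure"
    // model:  e.g. "whisper-1" or "@cf/openai/whisper"
    // result: TranscriptionResultSegmentGroupType[], sentence groups with offsets and timestamps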
enjoy/src/types/enjoy-app.d.ts (vendored)
@@ -213,10 +213,18 @@ type EnjoyAppType = {
     setService: (
       service: WhisperConfigType["service"]
     ) => Promise<WhisperConfigType>;
-    transcribeBlob: (
-      blob: { type: string; arrayBuffer: ArrayBuffer },
-      prompt?: string
-    ) => Promise<{ file: string; content: string }>;
+    transcribe: (
+      params: {
+        file?: string;
+        blob?: { type: string; arrayBuffer: ArrayBuffer };
+      },
+      options?: {
+        force?: boolean;
+        extra?: string[];
+      }
+    ) => Promise<Partial<WhisperOutputType>>;
+    onProgress: (callback: (event, progress: number) => void) => void;
+    removeProgressListeners: () => Promise<void>;
   };
   ffmpeg: {
     config: () => Promise<FfmpegConfigType>;
@@ -245,10 +253,7 @@ type EnjoyAppType = {
   };
   transcriptions: {
     findOrCreate: (params: any) => Promise<TranscriptionType>;
-    process: (params: any, options: any) => Promise<void>;
     update: (id: string, params: any) => Promise<void>;
-    onProgress: (callback: (event, progress: number) => void) => void;
-    removeProgressListeners: () => Promise<void>;
   };
   waveforms: {
     find: (id: string) => Promise<WaveFormDataType>;
enjoy/src/types/index.d.ts (vendored)
@@ -27,7 +27,7 @@ type NotificationType = {
 };
 
 type WhisperConfigType = {
-  service: "local" | "azure" | "cloudflare";
+  service: "local" | "azure" | "cloudflare" | "openai";
   availableModels: {
     type: string;
     name: string;
@@ -1,43 +1,5 @@
-import { createHash } from "crypto";
-import { createReadStream } from "fs";
 import Pitchfinder from "pitchfinder";
 
-export function hashFile(
-  path: string,
-  options: { algo: string }
-): Promise<string> {
-  const algo = options.algo || "md5";
-  return new Promise((resolve, reject) => {
-    const hash = createHash(algo);
-    const stream = createReadStream(path);
-    stream.on("error", reject);
-    stream.on("data", (chunk) => hash.update(chunk));
-    stream.on("end", () => resolve(hash.digest("hex")));
-  });
-}
-
-export function hashBlob(
-  blob: Blob,
-  options: { algo: string }
-): Promise<string> {
-  const algo = options.algo || "md5";
-  return new Promise((resolve, reject) => {
-    const hash = createHash(algo);
-    const reader = new FileReader();
-    reader.onload = () => {
-      if (reader.result instanceof ArrayBuffer) {
-        const buffer = Buffer.from(reader.result);
-        hash.update(buffer);
-        resolve(hash.digest("hex"));
-      } else {
-        reject(new Error("Unexpected result from FileReader"));
-      }
-    };
-    reader.onerror = reject;
-    reader.readAsArrayBuffer(blob);
-  });
-}
-
 export function generatePitch(peaks: Float32Array, sampleRate: number) {
   const detectPitch = Pitchfinder.YIN({ sampleRate });
   const duration = peaks.length / sampleRate;
@@ -77,3 +39,56 @@ export function milisecondsToTimestamp(ms: number) {
     "0"
   )}:${seconds.padStart(2, "0")},${milliseconds}`;
 }
 
+export const MAGIC_TOKENS = ["Mrs.", "Ms.", "Mr.", "Dr.", "Prof.", "St."];
+export const END_OF_WORD_REGEX = /[^\.!,\?][\.!\?]/g;
+export const groupTranscription = (
+  transcription: TranscriptionResultSegmentType[]
+): TranscriptionResultSegmentGroupType[] => {
+  const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
+    if (!group || group.length === 0) return;
+
+    const firstWord = group[0];
+    const lastWord = group[group.length - 1];
+
+    return {
+      offsets: {
+        from: firstWord.offsets.from,
+        to: lastWord.offsets.to,
+      },
+      text: group.map((w) => w.text.trim()).join(" "),
+      timestamps: {
+        from: firstWord.timestamps.from,
+        to: lastWord.timestamps.to,
+      },
+      segments: group,
+    };
+  };
+
+  const groups: TranscriptionResultSegmentGroupType[] = [];
+  let group: TranscriptionResultSegmentType[] = [];
+
+  transcription.forEach((segment) => {
+    const text = segment.text.trim();
+    if (!text) return;
+
+    group.push(segment);
+
+    if (
+      !MAGIC_TOKENS.includes(text) &&
+      segment.text.trim().match(END_OF_WORD_REGEX)
+    ) {
+      // Group a complete sentence;
+      groups.push(generateGroup(group));
+
+      // init a new group
+      group = [];
+    }
+  });
+
+  // Group the last group
+  const lastSentence = generateGroup(group);
+  if (lastSentence) groups.push(lastSentence);
+
+  return groups;
+};
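A worked example of the grouping logic now shared via @/utils: word segments accumulate until one matches END_OF_WORD_REGEX and is not a MAGIC_TOKEN, so "Dr." never ends a sentence. A minimal sketch with illustrative values:

    const words: TranscriptionResultSegmentType[] = [
      {
        text: "Hello",
        offsets: { from: 0, to: 400 },
        timestamps: { from: "00:00:00,000", to: "00:00:00,400" },
      },
      {
        text: "world.",
        offsets: { from: 400, to: 900 },
        timestamps: { from: "00:00:00,400", to: "00:00:00,900" },
      },
    ];

    const groups = groupTranscription(words);
    // => [{ text: "Hello world.", offsets: { from: 0, to: 900 }, segments: [/* the two words */] }]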