Feat: transcribe from web (#204)

* add transcribe from web

* transcribe from web

* add azure speech ai

* fix azure speech output

* allow selecting STT service

* fix UI

* remove debug code

* lint

* fix default stt service

* tweak

* fix secondsToTimestamp
an-lee
2024-01-27 00:45:06 +08:00
committed by GitHub
parent 99577c5020
commit cec9d73bc8
18 changed files with 404 additions and 81 deletions

View File

@@ -2,9 +2,10 @@ export const DATABASE_NAME = "enjoy_database";
export const LIBRARY_PATH_SUFFIX = "EnjoyLibrary";
export const STORAGE_WORKER_ENDPOINT = "https://enjoy-storage.baizhiheizi.com";
export const AI_WORKER_ENDPOINT = "https://enjoy-ai.baizhiheizi.com";
export const WEB_API_URL = "https://enjoy-web.fly.dev";
export const REPO_URL = "https://github.com/xiaolai/everyone-can-use-english"
export const REPO_URL = "https://github.com/xiaolai/everyone-can-use-english";
// https://huggingface.co/ggerganov/whisper.cpp/tree/main
export const WHISPER_MODELS_OPTIONS = [

View File

@@ -156,7 +156,7 @@
"autoCenter": "auto center",
"inlineCaption": "inline caption",
"autoScroll": "auto scroll",
"translate:": "translate",
"translate": "translate",
"displayIpa": "display IPA",
"detail": "detail",
"remove": "remove",
@@ -295,7 +295,13 @@
"advancedSettings": "Advanced settings",
"advanced": "Advanced",
"language": "Language",
"sttAiModel": "STT AI model",
"sttAiService": "STT AI service",
"local": "Local",
"localSpeechToTextDescription": "Use local whisper model to transcribe.",
"azureAi": "Azure AI",
"azureSpeechToTextDescription": "Use Azure AI Speech to transcribe.",
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "Use Cloudflare AI Worker to transcribe.",
"checkingWhisper": "Checking whisper status",
"pleaseDownloadWhisperModelFirst": "Please download whisper model first",
"whisperIsWorkingGood": "Whisper is working good",

View File

@@ -156,7 +156,7 @@
"autoCenter": "自动居中",
"inlineCaption": "内联字幕",
"autoScroll": "自动滚动",
"translate:": "翻译",
"translate": "翻译",
"displayIpa": "标注音标",
"detail": "详情",
"remove": "删除",
@@ -294,7 +294,13 @@
"advancedSettingsShort": "高级设置",
"advancedSettings": "高级设置",
"language": "语言",
"sttAiModel": "语音转文本 AI 模型",
"sttAiService": "语音转文本服务",
"local": "本地",
"localSpeechToTextDescription": "使用本地 whisper 模型进行语音转文本",
"azureAi": "Azure AI",
"azureSpeechToTextDescription": "使用 Azure AI Speech 进行语音转文本",
"cloudflareAi": "Cloudflare AI",
"cloudflareSpeechToTextDescription": "使用 Cloudflare AI 进行语音转文本",
"checkingWhisper": "正在检查 Whisper",
"pleaseDownloadWhisperModelFirst": "请先下载 Whisper 模型",
"whisperIsWorkingGood": "Whisper 正常工作",

View File

@@ -71,4 +71,58 @@ export class AzureSpeechSdk {
});
});
}
async transcribe(params: {
filePath: string;
language?: string;
}): Promise<SpeechRecognitionResultType[]> {
const { filePath, language = "en-US" } = params;
const audioConfig = sdk.AudioConfig.fromWavFileInput(
fs.readFileSync(filePath)
);
// set the recognition language (defaults to en-US).
this.config.speechRecognitionLanguage = language;
this.config.requestWordLevelTimestamps();
this.config.outputFormat = sdk.OutputFormat.Detailed;
// create the speech recognizer.
const reco = new sdk.SpeechRecognizer(this.config, audioConfig);
logger.debug("Start transcribe.");
let results: SpeechRecognitionResultType[] = [];
return new Promise((resolve, reject) => {
reco.recognizing = (_s, e) => {
logger.debug("Intermediate result received: ", e.result.text);
};
reco.recognized = (_s, e) => {
logger.debug("Got final result", e.result.text);
const json = e.result.properties.getProperty(
sdk.PropertyId.SpeechServiceResponse_JsonResult
);
const result = JSON.parse(json);
results = results.concat(result);
};
reco.canceled = (_s, e) => {
logger.debug("CANCELED: Reason=" + e.reason);
if (e.reason === sdk.CancellationReason.Error) {
logger.debug(`"CANCELED: ErrorCode=${e.errorCode}`);
logger.debug("CANCELED: ErrorDetails=" + e.errorDetails);
return reject(new Error(e.errorDetails));
}
reco.stopContinuousRecognitionAsync();
};
reco.sessionStopped = (_s, _e) => {
logger.debug("\n Session stopped event.");
reco.stopContinuousRecognitionAsync();
return resolve(results);
};
reco.startContinuousRecognitionAsync();
});
}
}
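For context, `transcribeFromAzure` in `whisper.ts` (further down in this commit) drives this method roughly as follows; a minimal sketch, assuming a speech token has already been issued by the web API and the input is a WAV file:

const { token, region } = await webApi.generateSpeechToken();
const sdk = new AzureSpeechSdk(token, region);
// Word-level Offset/Duration values in the results are 100-nanosecond ticks.
const results = await sdk.transcribe({
  filePath: "/tmp/recording.wav", // hypothetical path, for illustration only
  language: "en-US",
});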

View File

@@ -148,7 +148,11 @@ export class Transcription extends Model<Transcription> {
await this.update({
state: "processing",
});
const { model, transcription } = await whisper.transcribe(wavFile, {
const {
engine = "whisper",
model,
transcription,
} = await whisper.transcribe(wavFile, {
force,
extra: [
"--split-on-word",
@@ -158,7 +162,7 @@ export class Transcription extends Model<Transcription> {
});
const result = whisper.groupTranscription(transcription);
this.update({
engine: "whisper",
engine,
model: model?.type,
result,
state: "finished",

View File

@@ -58,12 +58,29 @@ const dbPath = () => {
};
const whisperConfig = (): WhisperConfigType => {
const model = settings.getSync("whisper.model") as string;
let service = settings.getSync(
"whisper.service"
) as WhisperConfigType["service"];
if (!service) {
if (model) {
settings.setSync("whisper.service", "local");
service = "local";
} else {
settings.setSync("whisper.service", "azure");
service = "azure";
}
}
return {
service,
availableModels: settings.getSync(
"whisper.availableModels"
) as WhisperConfigType["availableModels"],
modelsPath: settings.getSync("whisper.modelsPath") as string,
model: settings.getSync("whisper.model") as string,
model,
};
};
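With this fallback in place, a fresh install that has never downloaded a local whisper model lands on the hosted Azure service; an illustrative sketch of what the function then returns (values assumed):

const config = whisperConfig();
// config.service === "azure"  -- chosen by the fallback above, since "whisper.model" is unset
// config.model is undefined until a local model is downloaded
// "whisper.service" has also been persisted back to settings as "azure"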

View File

@@ -1,13 +1,30 @@
import { ipcMain } from "electron";
import settings from "@main/settings";
import path from "path";
import { WHISPER_MODELS_OPTIONS, PROCESS_TIMEOUT } from "@/constants";
import {
WHISPER_MODELS_OPTIONS,
PROCESS_TIMEOUT,
AI_WORKER_ENDPOINT,
} from "@/constants";
import { exec } from "child_process";
import fs from "fs-extra";
import log from "electron-log/main";
import { t } from "i18next";
import axios from "axios";
import { milisecondsToTimestamp } from "@/utils";
import { AzureSpeechSdk } from "@main/azure-speech-sdk";
import { Client } from "@/api";
import { WEB_API_URL } from "@/constants";
import { sortedUniqBy, take } from "lodash";
const logger = log.scope("whisper");
const webApi = new Client({
baseUrl: process.env.WEB_API_URL || WEB_API_URL,
accessToken: settings.getSync("user.accessToken") as string,
logger: log.scope("api/client"),
});
const MAGIC_TOKENS = ["Mrs.", "Ms.", "Mr.", "Dr.", "Prof.", "St."];
const END_OF_WORD_REGEX = /[^\.!,\?][\.!\?]/g;
class Whipser {
@@ -135,7 +152,7 @@ class Whipser {
group?: boolean;
}
): Promise<
TranscriptionSegmentType[] | TranscriptionResultSegmentGroupType[]
TranscriptionResultSegmentType[] | TranscriptionResultSegmentGroupType[]
> {
const { prompt, group = false } = options || {};
@@ -164,17 +181,123 @@ class Whipser {
}
}
async transcribe(
file: string,
options?: {
force?: boolean;
extra?: string[];
}
): Promise<Partial<WhisperOutputType>> {
if (this.config.service === "local") {
return this.transcribeFromLocal(file, options);
} else if (this.config.service === "azure") {
return this.transcribeFromAzure(file);
} else if (this.config.service === "cloudflare") {
return this.transcribeFromCloudflare(file);
} else {
throw new Error("Unknown service");
}
}
async transcribeFromAzure(file: string): Promise<Partial<WhisperOutputType>> {
const { token, region } = await webApi.generateSpeechToken();
const sdk = new AzureSpeechSdk(token, region);
const results = await sdk.transcribe({
filePath: file,
});
const transcription: TranscriptionResultSegmentType[] = [];
results.forEach((result) => {
logger.debug(result);
const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
const words = best.Display.trim().split(" ");
best.Words.map((word, index) => {
let text = word.Word;
if (words.length === best.Words.length) {
text = words[index];
}
if (
index === best.Words.length - 1 &&
!text.trim().match(END_OF_WORD_REGEX)
) {
text = text + ".";
}
transcription.push({
offsets: {
from: word.Offset / 1e4,
to: (word.Offset + word.Duration) / 1e4,
},
timestamps: {
from: milisecondsToTimestamp(word.Offset / 1e4),
to: milisecondsToTimestamp((word.Offset + word.Duration) / 1e4),
},
text,
});
});
});
return {
engine: "azure",
model: {
type: "Azure AI Speech",
},
transcription,
};
}
async transcribeFromCloudflare(
file: string
): Promise<Partial<WhisperOutputType>> {
logger.debug("transcribing from CloudFlare");
const data = fs.readFileSync(file);
const res: CfWhipserOutputType = (
await axios.postForm(`${AI_WORKER_ENDPOINT}/audio/transcriptions`, data)
).data;
logger.debug("transcription from Web,", res);
const transcription: TranscriptionResultSegmentType[] = res.words.map(
(word) => {
return {
offsets: {
from: word.start * 1000,
to: word.end * 1000,
},
timestamps: {
from: milisecondsToTimestamp(word.start * 1000),
to: milisecondsToTimestamp(word.end * 1000),
},
text: word.word,
};
}
);
logger.debug("converted transcription,", transcription);
return {
engine: "cloudflare",
model: {
type: "@cf/openai/whisper",
},
transcription,
};
}
/* Ensure the file is in wav format
* and 16kHz sample rate
*/
async transcribe(
async transcribeFromLocal(
file: string,
options: {
options?: {
force?: boolean;
extra?: string[];
} = {}
) {
const { force = false, extra = [] } = options;
}
): Promise<Partial<WhisperOutputType>> {
logger.debug("transcribing from local");
const { force = false, extra = [] } = options || {};
const filename = path.basename(file, path.extname(file));
const tmpDir = settings.cachePath();
const outputFile = path.join(tmpDir, filename + ".json");
@@ -232,9 +355,9 @@ class Whipser {
}
groupTranscription(
transcription: TranscriptionSegmentType[]
transcription: TranscriptionResultSegmentType[]
): TranscriptionResultSegmentGroupType[] {
const generateGroup = (group?: TranscriptionSegmentType[]) => {
const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
if (!group || group.length === 0) return;
const firstWord = group[0];
@@ -255,7 +378,7 @@ class Whipser {
};
const groups: TranscriptionResultSegmentGroupType[] = [];
let group: TranscriptionSegmentType[] = [];
let group: TranscriptionResultSegmentType[] = [];
transcription.forEach((segment) => {
const text = segment.text.trim();
@@ -310,6 +433,31 @@ class Whipser {
});
});
ipcMain.handle("whisper-set-service", async (event, service) => {
if (service === "local") {
try {
await this.initialize();
settings.setSync("whisper.service", service);
this.config.service = service;
return this.config;
} catch (err) {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
}
} else if (["cloudflare", "azure"].includes(service)) {
settings.setSync("whisper.service", service);
this.config.service = service;
return this.config;
} else {
event.sender.send("on-notification", {
type: "error",
message: "Unknown service",
});
}
});
ipcMain.handle("whisper-check", async (_event) => {
return await this.check();
});

View File

@@ -326,6 +326,9 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
setModel: (model: string) => {
return ipcRenderer.invoke("whisper-set-model", model);
},
setService: (service: string) => {
return ipcRenderer.invoke("whisper-set-service", service);
},
check: () => {
return ipcRenderer.invoke("whisper-check");
},
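From the renderer, the new bridge method can be used to switch services and read back the refreshed config; a minimal sketch (mirrors `setWhisperService` in AppSettingsProvider later in this commit):

const config = await window.__ENJOY_APP__.whisper.setService("cloudflare");
// The main process persists "whisper.service" and resolves with the updated WhisperConfigType.
// Switching to "local" re-runs initialize() first and resolves with undefined if that fails,
// emitting an "on-notification" error event instead.
console.log(config?.service); // "cloudflare"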

View File

@@ -220,7 +220,11 @@ export const MediaCaption = (props: {
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent>
<DropdownMenuItem disabled={translating} onClick={translate}>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={translating}
onClick={translate}
>
{translating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (
@@ -228,7 +232,11 @@ export const MediaCaption = (props: {
)}
<span>{t("translate")}</span>
</DropdownMenuItem>
<DropdownMenuItem disabled={ipaGenerating} onClick={toogleIPA}>
<DropdownMenuItem
className="cursor-pointer capitalize"
disabled={ipaGenerating}
onClick={toogleIPA}
>
{ipaGenerating ? (
<LoaderIcon className="w-4 h-4 mr-2 animate-spin" />
) : (

View File

@@ -8,6 +8,11 @@ import {
DialogDescription,
DialogFooter,
toast,
Select,
SelectTrigger,
SelectContent,
SelectItem,
SelectValue,
} from "@renderer/components/ui";
import { WhisperModelOptions } from "@renderer/components";
import { AppSettingsProviderContext } from "@renderer/context";
@@ -15,9 +20,8 @@ import { useContext, useEffect, useState } from "react";
import { InfoIcon, AlertCircleIcon } from "lucide-react";
export const WhisperSettings = () => {
const { whisperConfig, refreshWhisperConfig, EnjoyApp } = useContext(
AppSettingsProviderContext
);
const { whisperConfig, refreshWhisperConfig, EnjoyApp, setWhisperService } =
useContext(AppSettingsProviderContext);
const [stderr, setStderr] = useState("");
useEffect(() => {
@@ -48,7 +52,7 @@ export const WhisperSettings = () => {
<div className="flex items-start justify-between py-4">
<div className="">
<div className="flex items-center mb-2">
<span>{t("sttAiModel")}</span>
<span>{t("sttAiService")}</span>
{stderr && (
<Button
variant="ghost"
@@ -62,49 +66,74 @@ export const WhisperSettings = () => {
)}
</div>
<div className="text-sm text-muted-foreground">
{whisperConfig.model}
{whisperConfig?.service === "local" &&
t("localSpeechToTextDescription")}
{whisperConfig?.service === "azure" &&
t("azureSpeechToTextDescription")}
{whisperConfig?.service === "cloudflare" &&
t("cloudflareSpeechToTextDescription")}
</div>
</div>
<div className="flex items-center space-x-2">
<Button onClick={handleCheck} variant="secondary" size="sm">
{t("check")}
</Button>
<Dialog>
<DialogTrigger asChild>
<Button variant="secondary" size="sm">
{t("edit")}
<Select
value={whisperConfig.service}
onValueChange={(value) => {
setWhisperService(value);
}}
>
<SelectTrigger className="min-w-fit">
<SelectValue placeholder="service"></SelectValue>
</SelectTrigger>
<SelectContent>
<SelectItem value="local">{t("local")}</SelectItem>
<SelectItem value="azure">{t("azureAi")}</SelectItem>
<SelectItem value="cloudflare">{t("cloudflareAi")}</SelectItem>
</SelectContent>
</Select>
{whisperConfig.service === "local" && (
<>
<Button onClick={handleCheck} variant="secondary" size="sm">
{t("check")}
</Button>
</DialogTrigger>
<DialogContent>
<DialogHeader>{t("sttAiModel")}</DialogHeader>
<DialogDescription>
{t("chooseAIModelDependingOnYourHardware")}
</DialogDescription>
<WhisperModelOptions />
<DialogFooter>
<div className="text-xs flex items-start space-x-2">
<InfoIcon className="mr-1.5 w-4 h-4" />
<span className="flex-1 opacity-70">
{t("yourModelsWillBeDownloadedTo", {
path: whisperConfig.modelsPath,
})}
</span>
<Button
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="default"
size="sm"
>
{t("open")}
<Dialog>
<DialogTrigger asChild>
<Button variant="secondary" size="sm">
{t("model")}
</Button>
</div>
</DialogFooter>
</DialogContent>
</Dialog>
</DialogTrigger>
<DialogContent>
<DialogHeader>{t("sttAiService")}</DialogHeader>
<DialogDescription>
{t("chooseAIModelDependingOnYourHardware")}
</DialogDescription>
<WhisperModelOptions />
<DialogFooter>
<div className="text-xs flex items-start space-x-2">
<InfoIcon className="mr-1.5 w-4 h-4" />
<span className="flex-1 opacity-70">
{t("yourModelsWillBeDownloadedTo", {
path: whisperConfig.modelsPath,
})}
</span>
<Button
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="outline"
size="sm"
>
{t("open")}
</Button>
</div>
</DialogFooter>
</DialogContent>
</Dialog>
</>
)}
</div>
</div>
);

View File

@@ -5,7 +5,7 @@ import { cva, type VariantProps } from "class-variance-authority"
import { cn } from "@renderer/lib/utils"
const buttonVariants = cva(
"capitalize inline-flex items-center justify-center rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50",
"capitalize inline-flex items-center justify-center rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50 min-w-fit",
{
variants: {
variant: {

View File

@@ -68,7 +68,7 @@ export const WhisperModelOptionsPanel = () => {
onClick={() => {
EnjoyApp.shell.openPath(whisperConfig.modelsPath);
}}
variant="default"
variant="outline"
size="sm"
>
{t("open")}

View File

@@ -16,6 +16,7 @@ type AppSettingsProviderState = {
logout?: () => void;
setLibraryPath?: (path: string) => Promise<void>;
setWhisperModel?: (name: string) => Promise<void>;
setWhisperService?: (name: string) => Promise<void>;
ffmpegConfig?: FfmpegConfigType;
ffmpeg?: FFmpeg;
whisperConfig?: WhisperConfigType;
@@ -190,6 +191,13 @@ export const AppSettingsProvider = ({
});
};
const setWhisperService = async (name: WhisperConfigType["service"]) => {
return EnjoyApp.whisper.setService(name).then((config) => {
if (!config) return;
setWhisperConfig(config);
});
};
const validate = async () => {
setInitialized(Boolean(user && libraryPath));
};
@@ -208,6 +216,7 @@ export const AppSettingsProvider = ({
libraryPath,
setLibraryPath: setLibraryPathHandler,
setWhisperModel,
setWhisperService,
ffmpegConfig,
ffmpeg,
whisperConfig,

View File

@@ -16,8 +16,11 @@ export function cn(...inputs: ClassValue[]) {
}
export function secondsToTimestamp(seconds: number) {
const date = new Date(seconds * 1000);
return date.toISOString().substr(11, 8);
const h = Math.floor(seconds / 3600).toString();
const m = Math.floor((seconds % 3600) / 60).toString();
const s = Math.floor((seconds % 3600) % 60).toString();
return `${h.padStart(2, "0")}:${m.padStart(2, "0")}:${s.padStart(2, "0")}`;
}
export function humanizeDuration(
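The rewrite matters for durations of 24 hours or more, where the previous `Date`/ISO-string slicing wrapped around; illustrative expectations for the new arithmetic:

// secondsToTimestamp(75)    => "00:01:15"
// secondsToTimestamp(3661)  => "01:01:01"
// secondsToTimestamp(90000) => "25:00:00"  (the old Date-based version returned "01:00:00")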

View File

@@ -186,6 +186,9 @@ type EnjoyAppType = {
config: () => Promise<WhisperConfigType>;
check: () => Promise<{ success: boolean; log: string }>;
setModel: (model: string) => Promise<WhisperConfigType>;
setService: (
service: WhisperConfigType["service"]
) => Promise<WhisperConfigType>;
transcribeBlob: (
blob: { type: string; arrayBuffer: ArrayBuffer },
prompt?: string

View File

@@ -26,6 +26,7 @@ type NotificationType = {
};
type WhisperConfigType = {
service: "local" | "azure" | "cloudflare";
availableModels: {
type: string;
name: string;
@@ -39,24 +40,25 @@ type WhisperConfigType = {
};
type WhisperOutputType = {
engine?: string;
model: {
audio: {
audio?: {
cts: number;
head: number;
layer: number;
state: number;
};
ftype: number;
mels: number;
multilingual: number;
text: {
ftype?: number;
mels?: number;
multilingual?: number;
text?: {
cts: number;
head: number;
layer: number;
state: number;
};
type: string;
vocab: number;
vocab?: number;
};
params: {
language: string;
@@ -67,19 +69,17 @@ type WhisperOutputType = {
languate: string;
};
systeminfo: string;
transcription: TranscriptionSegmentType[];
transcription: TranscriptionResultSegmentType[];
};
type TranscriptionSegmentType = {
offsets: {
from: number;
to: number;
};
type CfWhipserOutputType = {
text: string;
timestamps: {
from: string;
to: string;
};
words_count: number;
words: {
word: string;
start: number;
end: number;
}[];
};
type TransactionStateType = {

View File

@@ -62,3 +62,24 @@ type PronunciationAssessmentWordResultType = {
};
};
};
type SpeechRecognitionResultType = {
Id: string;
RecognitionStatus: string;
Offset: number;
Duration: number;
Channel: number;
DisplayText: string;
NBest: {
Confidence: number;
Lexical: string;
ITN: string;
MaskedITN: string;
Display: string;
Words: {
Word: string;
Offset: number;
Duration: number;
}[];
}[];
};

View File

@@ -66,3 +66,14 @@ export function generatePitch(peaks: Float32Array, sampleRate: number) {
return { frequencies, baseFrequency };
}
export function milisecondsToTimestamp(ms: number) {
const hours = Math.floor(ms / 3600000).toString();
const minutes = Math.floor((ms % 3600000) / 60000).toString();
const seconds = Math.floor(((ms % 3600000) % 60000) / 1000).toString();
const milliseconds = Math.floor(((ms % 3600000) % 60000) % 1000).toString();
return `${hours.padStart(2, "0")}:${minutes.padStart(
2,
"0"
)}:${seconds.padStart(2, "0")},${milliseconds.padStart(3, "0")}`;
}
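For reference, this mirrors the "HH:MM:SS,mmm" timestamps produced by the local whisper path, so Azure and Cloudflare results line up with the existing transcription format; illustrative expectations:

// milisecondsToTimestamp(1500)    => "00:00:01,500"
// milisecondsToTimestamp(3723456) => "01:02:03,456"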