Refactor azure config (#729)

* save token id

* save token id in assessment result

* add use-pronunciation-assessment hook

Author: an-lee
Date: 2024-06-27 18:02:16 +08:00
Committed by: GitHub
Parent: 54872bb449
Commit: 63fa482363
6 changed files with 105 additions and 10 deletions


@@ -172,7 +172,11 @@ export class Recording extends Model<Recording> {
       logger,
     });
-    const { token, region } = await webApi.generateSpeechToken({
+    const {
+      id: tokenId,
+      token,
+      region,
+    } = await webApi.generateSpeechToken({
       targetId: this.id,
       targetType: "Recording",
     });
@@ -191,6 +195,7 @@
       }
     );
     resultJson.duration = this.duration;
+    resultJson.tokenId = tokenId;
     const _pronunciationAssessment = await PronunciationAssessment.create(
       {
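
For context, both hunks assume that webApi.generateSpeechToken now returns the persisted token record's id alongside the credentials. A minimal sketch of the assumed API surface, inferred from this commit's call sites rather than taken from a definitive signature:

```ts
// Assumed response shape of webApi.generateSpeechToken (sketch, not from this commit).
interface SpeechTokenResponse {
  id: string | number; // persisted token record id, saved onto results as tokenId
  token: string;       // short-lived Azure Speech authorization token
  region: string;      // Azure region the token was issued for
}

// Assumed client surface, inferred from the call sites in this commit.
interface WebApiClient {
  generateSpeechToken(params?: {
    targetId?: string;
    targetType?: string; // e.g. "Recording", or media.mediaType in useTranscribe
  }): Promise<SpeechTokenResponse>;
}
```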


@@ -56,7 +56,10 @@ export class Transcription extends Model<Transcription> {
   model: string;
 
   @Column(DataType.JSON)
-  result: Partial<AlignmentResult> & { originalText?: string };
+  result: Partial<AlignmentResult> & {
+    originalText?: string;
+    tokenId?: string | number;
+  };
 
   @Column(DataType.DATE)
   syncedAt: Date;
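
Because the result column is DataType.JSON, the new optional field round-trips without a schema migration. A hypothetical read-side helper (illustrative only, not part of the commit):

```ts
// Illustrative only; the type mirrors the widened column above.
type StoredTranscriptionResult = {
  originalText?: string;
  tokenId?: string | number;
};

// Correlate a stored result with the server-issued speech token, if any.
function speechTokenIdOf(
  result?: StoredTranscriptionResult
): string | number | undefined {
  return result?.tokenId;
}
```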


@@ -4,6 +4,7 @@ export * from "./use-camdict";
 export * from "./use-conversation";
 export * from "./use-notes";
 export * from "./use-recordings";
+export * from "./use-pronunciation-assessments";
 export * from "./use-segments";
 export * from "./use-transcribe";
 export * from "./use-transcriptions";


@@ -0,0 +1,79 @@
+import * as sdk from "microsoft-cognitiveservices-speech-sdk";
+import { useContext } from "react";
+import { AppSettingsProviderContext } from "@renderer/context";
+
+export const usePronunciationAssessments = () => {
+  const { webApi } = useContext(AppSettingsProviderContext);
+
+  const assess = async (
+    params: {
+      blob: Blob;
+      language: string;
+      reference?: string;
+    },
+    options?: {
+      targetId?: string;
+      targetType?: string;
+    }
+  ) => {
+    const { blob, language, reference } = params;
+    const { id, token, region } = await webApi.generateSpeechToken(options);
+
+    const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(
+      new File([blob], "audio.wav")
+    );
+    const pronunciationAssessmentConfig = new sdk.PronunciationAssessmentConfig(
+      reference,
+      sdk.PronunciationAssessmentGradingSystem.HundredMark,
+      sdk.PronunciationAssessmentGranularity.Phoneme,
+      true
+    );
+    pronunciationAssessmentConfig.phonemeAlphabet = "IPA";
+
+    // setting the recognition language
+    config.speechRecognitionLanguage = language;
+
+    // create the speech recognizer.
+    const reco = new sdk.SpeechRecognizer(config, audioConfig);
+    pronunciationAssessmentConfig.applyTo(reco);
+
+    return new Promise((resolve, reject) => {
+      reco.recognizeOnceAsync((result) => {
+        reco.close();
+
+        switch (result.reason) {
+          case sdk.ResultReason.RecognizedSpeech:
+            const pronunciationResult =
+              sdk.PronunciationAssessmentResult.fromResult(result);
+            console.debug(
+              "Received pronunciation assessment result.",
+              pronunciationResult.detailResult
+            );
+            resolve(pronunciationResult);
+            break;
+          case sdk.ResultReason.NoMatch:
+            reject(new Error("No speech could be recognized."));
+            break;
+          case sdk.ResultReason.Canceled:
+            const cancellationDetails =
+              sdk.CancellationDetails.fromResult(result);
+            console.debug(
+              "CANCELED: Reason=" +
+                cancellationDetails.reason +
+                " ErrorDetails=" +
+                cancellationDetails.errorDetails
+            );
+            reject(new Error(cancellationDetails.errorDetails));
+            break;
+          default:
+            reject(result);
+        }
+      });
+    });
+  };
+
+  return {
+    assess,
+  };
+};
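
A hypothetical caller of the new hook might look like the sketch below; the component name, its inputs, and the locale are illustrative, and only usePronunciationAssessments and its parameter shapes come from the commit:

```ts
import { usePronunciationAssessments } from "@renderer/hooks";

// Illustrative component; recordingId, recordingBlob, and referenceText are assumed inputs.
export const PronunciationScoreButton = (props: {
  recordingId: string;
  recordingBlob: Blob;
  referenceText: string;
}) => {
  const { assess } = usePronunciationAssessments();

  const handleAssess = async () => {
    try {
      const result = await assess(
        {
          blob: props.recordingBlob,
          language: "en-US",
          reference: props.referenceText,
        },
        { targetId: props.recordingId, targetType: "Recording" }
      );
      // Resolves with sdk.PronunciationAssessmentResult on success.
      console.log(result);
    } catch (err) {
      console.error(err);
    }
  };

  // Render omitted; wire handleAssess to a button's onClick.
  return null;
};
```

On success the hook resolves with the SDK's PronunciationAssessmentResult, whose accuracyScore, fluencyScore, and completenessScore properties carry the hundred-mark scores.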


@@ -42,6 +42,7 @@ export const useTranscribe = () => {
     model: string;
     alignmentResult: AlignmentResult;
     originalText?: string;
+    tokenId?: number;
   }> => {
     const url = await transcode(mediaSrc);
     const { targetId, targetType, originalText, language, service } =
@@ -173,8 +174,9 @@
     engine: string;
     model: string;
     text: string;
+    tokenId: number;
   }> => {
-    const { token, region } = await webApi.generateSpeechToken(params);
+    const { id, token, region } = await webApi.generateSpeechToken(params);
     const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
     const audioConfig = sdk.AudioConfig.fromWavFileInput(
       new File([blob], "audio.wav")
@@ -217,6 +219,7 @@
       engine: "azure",
       model: "whisper",
       text: results.map((result) => result.DisplayText).join(" "),
+      tokenId: id,
     });
   };


@@ -78,13 +78,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     setTranscribing(true);
     setTranscribingProgress(0);
     try {
-      const { engine, model, alignmentResult } = await transcribe(media.src, {
-        targetId: media.id,
-        targetType: media.mediaType,
-        originalText,
-        language,
-        service,
-      });
+      const { engine, model, alignmentResult, tokenId } = await transcribe(
+        media.src,
+        {
+          targetId: media.id,
+          targetType: media.mediaType,
+          originalText,
+          language,
+          service,
+        }
+      );
 
       let timeline: TimelineEntry[] = [];
       alignmentResult.timeline.forEach((t) => {
@@ -174,6 +177,7 @@
         timeline: timeline,
         transcript: alignmentResult.transcript,
         originalText,
+        tokenId,
       },
       engine,
       model,
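
Taken together, the commit threads the speech-token id end to end: generateSpeechToken returns it, the Azure branch of useTranscribe forwards it, and useTranscriptions persists it inside the result JSON. A sketch of the assumed persisted shape, with field names taken from the hunks above and the types an approximation:

```ts
// TimelineEntry comes from the app's shared types; treated as opaque here.
type TimelineEntry = unknown;

// Assumed shape of the persisted transcription.result after this commit
// (a sketch, not the definitive type).
type PersistedTranscriptionResult = {
  timeline: TimelineEntry[];  // built from alignmentResult.timeline
  transcript: string;         // alignmentResult.transcript
  originalText?: string;      // optional caller-supplied reference text
  tokenId?: string | number;  // links the result to the server-issued speech token
};
```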