diff --git a/enjoy/src/main/db/models/recording.ts b/enjoy/src/main/db/models/recording.ts
index 2494c23c..d55161ea 100644
--- a/enjoy/src/main/db/models/recording.ts
+++ b/enjoy/src/main/db/models/recording.ts
@@ -172,7 +172,11 @@ export class Recording extends Model {
       logger,
     });
 
-    const { token, region } = await webApi.generateSpeechToken({
+    const {
+      id: tokenId,
+      token,
+      region,
+    } = await webApi.generateSpeechToken({
       targetId: this.id,
       targetType: "Recording",
     });
@@ -191,6 +195,7 @@ export class Recording extends Model {
       }
     );
     resultJson.duration = this.duration;
+    resultJson.tokenId = tokenId;
 
     const _pronunciationAssessment = await PronunciationAssessment.create(
       {
diff --git a/enjoy/src/main/db/models/transcription.ts b/enjoy/src/main/db/models/transcription.ts
index becc3794..cb807f32 100644
--- a/enjoy/src/main/db/models/transcription.ts
+++ b/enjoy/src/main/db/models/transcription.ts
@@ -56,7 +56,10 @@ export class Transcription extends Model {
   model: string;
 
   @Column(DataType.JSON)
-  result: Partial & { originalText?: string };
+  result: Partial & {
+    originalText?: string;
+    tokenId?: string | number;
+  };
 
   @Column(DataType.DATE)
   syncedAt: Date;
diff --git a/enjoy/src/renderer/hooks/index.ts b/enjoy/src/renderer/hooks/index.ts
index 1364ea36..414c3ccb 100644
--- a/enjoy/src/renderer/hooks/index.ts
+++ b/enjoy/src/renderer/hooks/index.ts
@@ -4,6 +4,7 @@ export * from "./use-camdict";
 export * from "./use-conversation";
 export * from "./use-notes";
 export * from "./use-recordings";
+export * from "./use-pronunciation-assessments";
 export * from "./use-segments";
 export * from "./use-transcribe";
 export * from "./use-transcriptions";
diff --git a/enjoy/src/renderer/hooks/use-pronunciation-assessments.tsx b/enjoy/src/renderer/hooks/use-pronunciation-assessments.tsx
new file mode 100644
index 00000000..706c6efa
--- /dev/null
+++ b/enjoy/src/renderer/hooks/use-pronunciation-assessments.tsx
@@ -0,0 +1,104 @@
+import * as sdk from "microsoft-cognitiveservices-speech-sdk";
+import { useContext } from "react";
+import { AppSettingsProviderContext } from "@renderer/context";
+
+/**
+ * Hook exposing `assess`, which runs a pronunciation assessment on an audio
+ * blob through the Microsoft Cognitive Services Speech SDK.
+ */
+export const usePronunciationAssessments = () => {
+  const { webApi } = useContext(AppSettingsProviderContext);
+
+  /**
+   * Run a single-shot pronunciation assessment on a recording.
+   *
+   * @param params.blob - audio data, wrapped into a file named "audio.wav"
+   * @param params.language - speech recognition language for the recognizer
+   * @param params.reference - reference text given to the assessment config
+   * @param options - forwarded unchanged to `webApi.generateSpeechToken`
+   * @returns the assessment result built from the recognition result
+   * @throws Error when no speech is matched, recognition is canceled, or the
+   *   recognizer fails; any other result reason rejects with the raw result
+   */
+  const assess = async (
+    params: {
+      blob: Blob;
+      language: string;
+      reference?: string;
+    },
+    options?: {
+      targetId?: string;
+      targetType?: string;
+    }
+  ): Promise<sdk.PronunciationAssessmentResult> => {
+    const { blob, language, reference } = params;
+    const { token, region } = await webApi.generateSpeechToken(options);
+    const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
+    const audioConfig = sdk.AudioConfig.fromWavFileInput(
+      new File([blob], "audio.wav")
+    );
+
+    const pronunciationAssessmentConfig = new sdk.PronunciationAssessmentConfig(
+      reference,
+      sdk.PronunciationAssessmentGradingSystem.HundredMark,
+      sdk.PronunciationAssessmentGranularity.Phoneme,
+      true
+    );
+    pronunciationAssessmentConfig.phonemeAlphabet = "IPA";
+
+    // setting the recognition language
+    config.speechRecognitionLanguage = language;
+
+    // create the speech recognizer.
+    const reco = new sdk.SpeechRecognizer(config, audioConfig);
+    pronunciationAssessmentConfig.applyTo(reco);
+
+    return new Promise<sdk.PronunciationAssessmentResult>((resolve, reject) => {
+      reco.recognizeOnceAsync(
+        (result) => {
+          reco.close();
+
+          switch (result.reason) {
+            case sdk.ResultReason.RecognizedSpeech: {
+              const pronunciationResult =
+                sdk.PronunciationAssessmentResult.fromResult(result);
+              console.debug(
+                "Received pronunciation assessment result.",
+                pronunciationResult.detailResult
+              );
+              resolve(pronunciationResult);
+              break;
+            }
+            case sdk.ResultReason.NoMatch:
+              reject(new Error("No speech could be recognized."));
+              break;
+            case sdk.ResultReason.Canceled: {
+              const cancellationDetails =
+                sdk.CancellationDetails.fromResult(result);
+              console.debug(
+                "CANCELED: Reason=" +
+                  cancellationDetails.reason +
+                  " ErrorDetails=" +
+                  cancellationDetails.errorDetails
+              );
+              reject(new Error(cancellationDetails.errorDetails));
+              break;
+            }
+            default:
+              reject(result);
+          }
+        },
+        // Without an error callback a recognizer failure would leave this
+        // promise pending forever and the recognizer open.
+        (error) => {
+          reco.close();
+          reject(new Error(error));
+        }
+      );
+    });
+  };
+
+  return {
+    assess,
+  };
+};
diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx
index 3b9faaab..79d9e515 100644
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -42,6 +42,7 @@ export const useTranscribe = () => {
     model: string;
     alignmentResult: AlignmentResult;
     originalText?: string;
+    tokenId?: number;
   }> => {
     const url = await transcode(mediaSrc);
     const { targetId, targetType, originalText, language, service } =
@@ -173,8 +174,9 @@ export const useTranscribe = () => {
     engine: string;
     model: string;
     text: string;
+    tokenId: number;
   }> => {
-    const { token, region } = await webApi.generateSpeechToken(params);
+    const { id, token, region } = await webApi.generateSpeechToken(params);
     const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
     const audioConfig = sdk.AudioConfig.fromWavFileInput(
       new File([blob], "audio.wav")
@@ -217,6 +219,7 @@ export const useTranscribe = () => {
       engine: "azure",
       model: "whisper",
       text: results.map((result) => result.DisplayText).join(" "),
+      tokenId: id,
     });
   };
 
diff --git a/enjoy/src/renderer/hooks/use-transcriptions.tsx b/enjoy/src/renderer/hooks/use-transcriptions.tsx
index 54bb3a93..b5db427d 100644
--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -78,13 +78,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     setTranscribing(true);
     setTranscribingProgress(0);
     try {
-      const { engine, model, alignmentResult } = await transcribe(media.src, {
-        targetId: media.id,
-        targetType: media.mediaType,
-        originalText,
-        language,
-        service,
-      });
+      const { engine, model, alignmentResult, tokenId } = await transcribe(
+        media.src,
+        {
+          targetId: media.id,
+          targetType: media.mediaType,
+          originalText,
+          language,
+          service,
+        }
+      );
 
       let timeline: TimelineEntry[] = [];
       alignmentResult.timeline.forEach((t) => {
@@ -174,6 +177,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
           timeline: timeline,
           transcript: alignmentResult.transcript,
           originalText,
+          tokenId,
         },
         engine,
         model,