diff --git a/enjoy/src/main/whisper.ts b/enjoy/src/main/whisper.ts index 94974541..f38f8767 100644 --- a/enjoy/src/main/whisper.ts +++ b/enjoy/src/main/whisper.ts @@ -188,9 +188,6 @@ class Whipser { "--output-file", path.join(tmpDir, filename), "-pp", - "--split-on-word", - "--max-len", - "1", ...extra, ]; diff --git a/enjoy/src/renderer/components/medias/media-recorder.tsx b/enjoy/src/renderer/components/medias/media-recorder.tsx index 83d45f68..f2759f62 100644 --- a/enjoy/src/renderer/components/medias/media-recorder.tsx +++ b/enjoy/src/renderer/components/medias/media-recorder.tsx @@ -18,6 +18,7 @@ export const MediaRecorder = () => { transcription, currentSegmentIndex, } = useContext(MediaPlayerProviderContext); + const [player, setPlayer] = useState(); const [access, setAccess] = useState(false); const [duration, setDuration] = useState(0); const { EnjoyApp } = useContext(AppSettingsProviderContext); @@ -80,6 +81,7 @@ export const MediaRecorder = () => { autoCenter: false, normalize: false, }); + setPlayer(ws); const record = ws.registerPlugin(RecordPlugin.create()); let startAt = 0; @@ -113,9 +115,9 @@ export const MediaRecorder = () => { }); return () => { - clearInterval(interval); - record.stopRecording(); - ws?.destroy(); + if (interval) clearInterval(interval); + record?.stopRecording(); + player?.destroy(); }; }, [ref, isRecording, access, layout?.playerHeight]); diff --git a/enjoy/src/renderer/hooks/use-transcribe.tsx b/enjoy/src/renderer/hooks/use-transcribe.tsx index cffde52f..669f8466 100644 --- a/enjoy/src/renderer/hooks/use-transcribe.tsx +++ b/enjoy/src/renderer/hooks/use-transcribe.tsx @@ -8,10 +8,6 @@ import { t } from "i18next"; import { AI_WORKER_ENDPOINT } from "@/constants"; import * as sdk from "microsoft-cognitiveservices-speech-sdk"; import axios from "axios"; -import take from "lodash/take"; -import sortedUniqBy from "lodash/sortedUniqBy"; -import { groupTranscription, milisecondsToTimestamp } from "@/utils"; -import { END_OF_SENTENCE_REGEX } from "@/constants"; import { AlignmentResult } from "echogarden/dist/api/API.d.js"; export const useTranscribe = () => { @@ -67,7 +63,7 @@ export const useTranscribe = () => { const alignmentResult = await EnjoyApp.echogarden.align( new Uint8Array(await blob.arrayBuffer()), - originalText || result.result.map((segment) => segment.text).join(" ") + originalText || result.text ); return { @@ -88,12 +84,10 @@ export const useTranscribe = () => { } ); - const result = groupTranscription(res.transcription); - return { engine: "whisper", model: res.model.type, - result, + text: res.transcription.map((segment) => segment.text).join(" "), }; }; @@ -108,41 +102,16 @@ export const useTranscribe = () => { dangerouslyAllowBrowser: true, }); - const res: { - words: { - word: string; - start: number; - end: number; - }[]; - } = (await client.audio.transcriptions.create({ + const res: { text: string } = (await client.audio.transcriptions.create({ file: new File([blob], "audio.wav"), model: "whisper-1", - response_format: "verbose_json", - timestamp_granularities: ["word"], + response_format: "json", })) as any; - const transcription: TranscriptionResultSegmentType[] = res.words.map( - (word) => { - return { - offsets: { - from: word.start * 1000, - to: word.end * 1000, - }, - timestamps: { - from: milisecondsToTimestamp(word.start * 1000), - to: milisecondsToTimestamp(word.end * 1000), - }, - text: word.word, - }; - } - ); - - const result = groupTranscription(transcription); - return { engine: "openai", model: "whisper-1", - result, + text: res.text, }; }; @@ -155,28 +124,11 @@ export const useTranscribe = () => { timeout: 1000 * 60 * 5, }) ).data; - const transcription: TranscriptionResultSegmentType[] = res.words.map( - (word) => { - return { - offsets: { - from: word.start * 1000, - to: word.end * 1000, - }, - timestamps: { - from: milisecondsToTimestamp(word.start * 1000), - to: milisecondsToTimestamp(word.end * 1000), - }, - text: word.word, - }; - } - ); - - const result = groupTranscription(transcription); return { engine: "cloudflare", model: "@cf/openai/whisper", - result, + text: res.text, }; }; @@ -189,7 +141,7 @@ export const useTranscribe = () => { ): Promise<{ engine: string; model: string; - result: TranscriptionResultSegmentGroupType[]; + text: string; }> => { const { token, region } = await webApi.generateSpeechToken(params); const config = sdk.SpeechConfig.fromAuthorizationToken(token, region); @@ -230,43 +182,10 @@ export const useTranscribe = () => { reco.sessionStopped = (_s, _e) => { reco.stopContinuousRecognitionAsync(); - const transcription: TranscriptionResultSegmentType[] = []; - - results.forEach((result) => { - const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0]; - const words = best.Display.trim().split(" "); - - best.Words.map((word, index) => { - let text = word.Word; - if (words.length === best.Words.length) { - text = words[index]; - } - - if ( - index === best.Words.length - 1 && - !text.trim().match(END_OF_SENTENCE_REGEX) - ) { - text = text + "."; - } - - transcription.push({ - offsets: { - from: word.Offset / 1e4, - to: (word.Offset + word.Duration) / 1e4, - }, - timestamps: { - from: milisecondsToTimestamp(word.Offset / 1e4), - to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4), - }, - text, - }); - }); - }); - resolve({ engine: "azure", model: "whisper", - result: groupTranscription(transcription), + text: results.map((result) => result.DisplayText).join(' '), }); }; diff --git a/enjoy/src/utils.ts b/enjoy/src/utils.ts index 3306a388..36c16319 100644 --- a/enjoy/src/utils.ts +++ b/enjoy/src/utils.ts @@ -1,5 +1,4 @@ import Pitchfinder from "pitchfinder"; -import { END_OF_SENTENCE_REGEX, MAGIC_TOKEN_REGEX } from "./constants"; import { IPA_MAPPING } from "./constants"; export const extractFrequencies = (props: { @@ -34,57 +33,6 @@ export function milisecondsToTimestamp(ms: number) { )}:${seconds.padStart(2, "0")},${milliseconds}`; } -export const groupTranscription = ( - transcription: TranscriptionResultSegmentType[] -): TranscriptionResultSegmentGroupType[] => { - const generateGroup = (group?: TranscriptionResultSegmentType[]) => { - if (!group || group.length === 0) return; - - const firstWord = group[0]; - const lastWord = group[group.length - 1]; - - return { - offsets: { - from: firstWord.offsets.from, - to: lastWord.offsets.to, - }, - text: group.map((w) => w.text.trim()).join(" "), - timestamps: { - from: firstWord.timestamps.from, - to: lastWord.timestamps.to, - }, - segments: group, - }; - }; - - const groups: TranscriptionResultSegmentGroupType[] = []; - let group: TranscriptionResultSegmentType[] = []; - - transcription.forEach((segment) => { - const text = segment.text.trim(); - if (!text) return; - - group.push(segment); - - if ( - !text.match(MAGIC_TOKEN_REGEX) && - segment.text.trim().match(END_OF_SENTENCE_REGEX) - ) { - // Group a complete sentence; - groups.push(generateGroup(group)); - - // init a new group - group = []; - } - }); - - // Group the last group - const lastSentence = generateGroup(group); - if (lastSentence) groups.push(lastSentence); - - return groups; -}; - export const convertIpaToNormal = (ipa: string) => { const mark = ipa.match(/(\ˈ|ˌ)/); const cleanIpa = ipa.replace(mark ? mark[0] : "", "");