Refactor transcription (#476)
* word-level timestamp is not needed for alignment * remove deprecated code * fix error when stop recording
This commit is contained in:
@@ -188,9 +188,6 @@ class Whipser {
|
||||
"--output-file",
|
||||
path.join(tmpDir, filename),
|
||||
"-pp",
|
||||
"--split-on-word",
|
||||
"--max-len",
|
||||
"1",
|
||||
...extra,
|
||||
];
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@ export const MediaRecorder = () => {
|
||||
transcription,
|
||||
currentSegmentIndex,
|
||||
} = useContext(MediaPlayerProviderContext);
|
||||
const [player, setPlayer] = useState<WaveSurfer>();
|
||||
const [access, setAccess] = useState<boolean>(false);
|
||||
const [duration, setDuration] = useState<number>(0);
|
||||
const { EnjoyApp } = useContext(AppSettingsProviderContext);
|
||||
@@ -80,6 +81,7 @@ export const MediaRecorder = () => {
|
||||
autoCenter: false,
|
||||
normalize: false,
|
||||
});
|
||||
setPlayer(ws);
|
||||
|
||||
const record = ws.registerPlugin(RecordPlugin.create());
|
||||
let startAt = 0;
|
||||
@@ -113,9 +115,9 @@ export const MediaRecorder = () => {
|
||||
});
|
||||
|
||||
return () => {
|
||||
clearInterval(interval);
|
||||
record.stopRecording();
|
||||
ws?.destroy();
|
||||
if (interval) clearInterval(interval);
|
||||
record?.stopRecording();
|
||||
player?.destroy();
|
||||
};
|
||||
}, [ref, isRecording, access, layout?.playerHeight]);
|
||||
|
||||
|
||||
@@ -8,10 +8,6 @@ import { t } from "i18next";
|
||||
import { AI_WORKER_ENDPOINT } from "@/constants";
|
||||
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
|
||||
import axios from "axios";
|
||||
import take from "lodash/take";
|
||||
import sortedUniqBy from "lodash/sortedUniqBy";
|
||||
import { groupTranscription, milisecondsToTimestamp } from "@/utils";
|
||||
import { END_OF_SENTENCE_REGEX } from "@/constants";
|
||||
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
|
||||
|
||||
export const useTranscribe = () => {
|
||||
@@ -67,7 +63,7 @@ export const useTranscribe = () => {
|
||||
|
||||
const alignmentResult = await EnjoyApp.echogarden.align(
|
||||
new Uint8Array(await blob.arrayBuffer()),
|
||||
originalText || result.result.map((segment) => segment.text).join(" ")
|
||||
originalText || result.text
|
||||
);
|
||||
|
||||
return {
|
||||
@@ -88,12 +84,10 @@ export const useTranscribe = () => {
|
||||
}
|
||||
);
|
||||
|
||||
const result = groupTranscription(res.transcription);
|
||||
|
||||
return {
|
||||
engine: "whisper",
|
||||
model: res.model.type,
|
||||
result,
|
||||
text: res.transcription.map((segment) => segment.text).join(" "),
|
||||
};
|
||||
};
|
||||
|
||||
@@ -108,41 +102,16 @@ export const useTranscribe = () => {
|
||||
dangerouslyAllowBrowser: true,
|
||||
});
|
||||
|
||||
const res: {
|
||||
words: {
|
||||
word: string;
|
||||
start: number;
|
||||
end: number;
|
||||
}[];
|
||||
} = (await client.audio.transcriptions.create({
|
||||
const res: { text: string } = (await client.audio.transcriptions.create({
|
||||
file: new File([blob], "audio.wav"),
|
||||
model: "whisper-1",
|
||||
response_format: "verbose_json",
|
||||
timestamp_granularities: ["word"],
|
||||
response_format: "json",
|
||||
})) as any;
|
||||
|
||||
const transcription: TranscriptionResultSegmentType[] = res.words.map(
|
||||
(word) => {
|
||||
return {
|
||||
offsets: {
|
||||
from: word.start * 1000,
|
||||
to: word.end * 1000,
|
||||
},
|
||||
timestamps: {
|
||||
from: milisecondsToTimestamp(word.start * 1000),
|
||||
to: milisecondsToTimestamp(word.end * 1000),
|
||||
},
|
||||
text: word.word,
|
||||
};
|
||||
}
|
||||
);
|
||||
|
||||
const result = groupTranscription(transcription);
|
||||
|
||||
return {
|
||||
engine: "openai",
|
||||
model: "whisper-1",
|
||||
result,
|
||||
text: res.text,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -155,28 +124,11 @@ export const useTranscribe = () => {
|
||||
timeout: 1000 * 60 * 5,
|
||||
})
|
||||
).data;
|
||||
const transcription: TranscriptionResultSegmentType[] = res.words.map(
|
||||
(word) => {
|
||||
return {
|
||||
offsets: {
|
||||
from: word.start * 1000,
|
||||
to: word.end * 1000,
|
||||
},
|
||||
timestamps: {
|
||||
from: milisecondsToTimestamp(word.start * 1000),
|
||||
to: milisecondsToTimestamp(word.end * 1000),
|
||||
},
|
||||
text: word.word,
|
||||
};
|
||||
}
|
||||
);
|
||||
|
||||
const result = groupTranscription(transcription);
|
||||
|
||||
return {
|
||||
engine: "cloudflare",
|
||||
model: "@cf/openai/whisper",
|
||||
result,
|
||||
text: res.text,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -189,7 +141,7 @@ export const useTranscribe = () => {
|
||||
): Promise<{
|
||||
engine: string;
|
||||
model: string;
|
||||
result: TranscriptionResultSegmentGroupType[];
|
||||
text: string;
|
||||
}> => {
|
||||
const { token, region } = await webApi.generateSpeechToken(params);
|
||||
const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
|
||||
@@ -230,43 +182,10 @@ export const useTranscribe = () => {
|
||||
reco.sessionStopped = (_s, _e) => {
|
||||
reco.stopContinuousRecognitionAsync();
|
||||
|
||||
const transcription: TranscriptionResultSegmentType[] = [];
|
||||
|
||||
results.forEach((result) => {
|
||||
const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
|
||||
const words = best.Display.trim().split(" ");
|
||||
|
||||
best.Words.map((word, index) => {
|
||||
let text = word.Word;
|
||||
if (words.length === best.Words.length) {
|
||||
text = words[index];
|
||||
}
|
||||
|
||||
if (
|
||||
index === best.Words.length - 1 &&
|
||||
!text.trim().match(END_OF_SENTENCE_REGEX)
|
||||
) {
|
||||
text = text + ".";
|
||||
}
|
||||
|
||||
transcription.push({
|
||||
offsets: {
|
||||
from: word.Offset / 1e4,
|
||||
to: (word.Offset + word.Duration) / 1e4,
|
||||
},
|
||||
timestamps: {
|
||||
from: milisecondsToTimestamp(word.Offset / 1e4),
|
||||
to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
|
||||
},
|
||||
text,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
resolve({
|
||||
engine: "azure",
|
||||
model: "whisper",
|
||||
result: groupTranscription(transcription),
|
||||
text: results.map((result) => result.DisplayText).join(' '),
|
||||
});
|
||||
};
|
||||
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import Pitchfinder from "pitchfinder";
|
||||
import { END_OF_SENTENCE_REGEX, MAGIC_TOKEN_REGEX } from "./constants";
|
||||
import { IPA_MAPPING } from "./constants";
|
||||
|
||||
export const extractFrequencies = (props: {
|
||||
@@ -34,57 +33,6 @@ export function milisecondsToTimestamp(ms: number) {
|
||||
)}:${seconds.padStart(2, "0")},${milliseconds}`;
|
||||
}
|
||||
|
||||
export const groupTranscription = (
|
||||
transcription: TranscriptionResultSegmentType[]
|
||||
): TranscriptionResultSegmentGroupType[] => {
|
||||
const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
|
||||
if (!group || group.length === 0) return;
|
||||
|
||||
const firstWord = group[0];
|
||||
const lastWord = group[group.length - 1];
|
||||
|
||||
return {
|
||||
offsets: {
|
||||
from: firstWord.offsets.from,
|
||||
to: lastWord.offsets.to,
|
||||
},
|
||||
text: group.map((w) => w.text.trim()).join(" "),
|
||||
timestamps: {
|
||||
from: firstWord.timestamps.from,
|
||||
to: lastWord.timestamps.to,
|
||||
},
|
||||
segments: group,
|
||||
};
|
||||
};
|
||||
|
||||
const groups: TranscriptionResultSegmentGroupType[] = [];
|
||||
let group: TranscriptionResultSegmentType[] = [];
|
||||
|
||||
transcription.forEach((segment) => {
|
||||
const text = segment.text.trim();
|
||||
if (!text) return;
|
||||
|
||||
group.push(segment);
|
||||
|
||||
if (
|
||||
!text.match(MAGIC_TOKEN_REGEX) &&
|
||||
segment.text.trim().match(END_OF_SENTENCE_REGEX)
|
||||
) {
|
||||
// Group a complete sentence;
|
||||
groups.push(generateGroup(group));
|
||||
|
||||
// init a new group
|
||||
group = [];
|
||||
}
|
||||
});
|
||||
|
||||
// Group the last group
|
||||
const lastSentence = generateGroup(group);
|
||||
if (lastSentence) groups.push(lastSentence);
|
||||
|
||||
return groups;
|
||||
};
|
||||
|
||||
export const convertIpaToNormal = (ipa: string) => {
|
||||
const mark = ipa.match(/(\ˈ|ˌ)/);
|
||||
const cleanIpa = ipa.replace(mark ? mark[0] : "", "");
|
||||
|
||||
Reference in New Issue
Block a user