Refactor transcription (#476)

* word-level timestamps are not needed for alignment

* remove deprecated code

* fix error when stopping recording
an-lee
2024-04-02 14:03:02 +08:00
committed by GitHub
parent f4d1d2a730
commit f0f4319044
4 changed files with 13 additions and 147 deletions

View File

@@ -188,9 +188,6 @@ class Whipser {
"--output-file",
path.join(tmpDir, filename),
"-pp",
"--split-on-word",
"--max-len",
"1",
...extra,
];
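With --split-on-word and --max-len 1 removed, whisper.cpp emits normal phrase-level segments instead of one segment per word, which is all the later alignment step needs. A minimal sketch of the trimmed argument list, assuming the tmpDir, filename, and extra values from the surrounding method:

import path from "path";

// Sketch only: the word-splitting flags are gone because sentence/word timing
// now comes from the alignment step, not from whisper.cpp itself.
const buildWhisperArgs = (tmpDir: string, filename: string, extra: string[] = []) => [
  "--output-file",
  path.join(tmpDir, filename),
  "-pp", // print progress while transcribing
  ...extra,
];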

View File

@@ -18,6 +18,7 @@ export const MediaRecorder = () => {
transcription,
currentSegmentIndex,
} = useContext(MediaPlayerProviderContext);
+ const [player, setPlayer] = useState<WaveSurfer>();
const [access, setAccess] = useState<boolean>(false);
const [duration, setDuration] = useState<number>(0);
const { EnjoyApp } = useContext(AppSettingsProviderContext);
@@ -80,6 +81,7 @@ export const MediaRecorder = () => {
autoCenter: false,
normalize: false,
});
+ setPlayer(ws);
const record = ws.registerPlugin(RecordPlugin.create());
let startAt = 0;
@@ -113,9 +115,9 @@ export const MediaRecorder = () => {
});
return () => {
- clearInterval(interval);
- record.stopRecording();
- ws?.destroy();
+ if (interval) clearInterval(interval);
+ record?.stopRecording();
+ player?.destroy();
};
}, [ref, isRecording, access, layout?.playerHeight]);
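The stop-recording error this commit addresses appears to come from teardown touching handles that may not exist yet (the interval, the recorder, or the player). A small self-contained illustration of the guarded-cleanup idea, not the component's exact code:

// Illustration only: guard every handle before tearing it down, since cleanup
// can run before recording (or the timer) was ever started.
let interval: ReturnType<typeof setInterval> | undefined;
let record: { stopRecording: () => void } | undefined;

const teardown = () => {
  if (interval) clearInterval(interval);
  record?.stopRecording();
};

teardown(); // safe no-op even though nothing was started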

View File

@@ -8,10 +8,6 @@ import { t } from "i18next";
import { AI_WORKER_ENDPOINT } from "@/constants";
import * as sdk from "microsoft-cognitiveservices-speech-sdk";
import axios from "axios";
- import take from "lodash/take";
- import sortedUniqBy from "lodash/sortedUniqBy";
- import { groupTranscription, milisecondsToTimestamp } from "@/utils";
- import { END_OF_SENTENCE_REGEX } from "@/constants";
import { AlignmentResult } from "echogarden/dist/api/API.d.js";
export const useTranscribe = () => {
@@ -67,7 +63,7 @@ export const useTranscribe = () => {
const alignmentResult = await EnjoyApp.echogarden.align(
new Uint8Array(await blob.arrayBuffer()),
- originalText || result.result.map((segment) => segment.text).join(" ")
+ originalText || result.text
);
return {
@@ -88,12 +84,10 @@ export const useTranscribe = () => {
}
);
- const result = groupTranscription(res.transcription);
return {
engine: "whisper",
model: res.model.type,
- result,
text: res.transcription.map((segment) => segment.text).join(" "),
};
};
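The local whisper branch now derives the plain transcript directly from its segments. A tiny illustration with an assumed minimal segment shape:

type WhisperSegment = { text: string };

// Join segment texts into the full transcript handed to alignment later.
const toPlainText = (segments: WhisperSegment[]): string =>
  segments.map((segment) => segment.text).join(" ");

// e.g. toPlainText([{ text: "Hello" }, { text: "world." }]) returns "Hello world."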
@@ -108,41 +102,16 @@ export const useTranscribe = () => {
dangerouslyAllowBrowser: true,
});
- const res: {
- words: {
- word: string;
- start: number;
- end: number;
- }[];
- } = (await client.audio.transcriptions.create({
+ const res: { text: string } = (await client.audio.transcriptions.create({
file: new File([blob], "audio.wav"),
model: "whisper-1",
- response_format: "verbose_json",
- timestamp_granularities: ["word"],
+ response_format: "json",
})) as any;
- const transcription: TranscriptionResultSegmentType[] = res.words.map(
- (word) => {
- return {
- offsets: {
- from: word.start * 1000,
- to: word.end * 1000,
- },
- timestamps: {
- from: milisecondsToTimestamp(word.start * 1000),
- to: milisecondsToTimestamp(word.end * 1000),
- },
- text: word.word,
- };
- }
- );
- const result = groupTranscription(transcription);
return {
engine: "openai",
model: "whisper-1",
- result,
text: res.text,
};
};
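For reference, a standalone sketch of the simplified OpenAI branch; the client setup mirrors the hunk above, and blob is assumed to hold WAV audio:

import OpenAI from "openai";

// Sketch: with response_format "json" the API returns only { text }; word-level
// timestamps are no longer requested because timing is recovered by alignment.
const transcribeByOpenAI = async (apiKey: string, blob: Blob): Promise<string> => {
  const client = new OpenAI({ apiKey, dangerouslyAllowBrowser: true });
  const res = await client.audio.transcriptions.create({
    file: new File([blob], "audio.wav"),
    model: "whisper-1",
    response_format: "json",
  });
  return res.text;
};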
@@ -155,28 +124,11 @@ export const useTranscribe = () => {
timeout: 1000 * 60 * 5,
})
).data;
- const transcription: TranscriptionResultSegmentType[] = res.words.map(
- (word) => {
- return {
- offsets: {
- from: word.start * 1000,
- to: word.end * 1000,
- },
- timestamps: {
- from: milisecondsToTimestamp(word.start * 1000),
- to: milisecondsToTimestamp(word.end * 1000),
- },
- text: word.word,
- };
- }
- );
- const result = groupTranscription(transcription);
return {
engine: "cloudflare",
model: "@cf/openai/whisper",
- result,
text: res.text,
};
};
@@ -189,7 +141,7 @@ export const useTranscribe = () => {
): Promise<{
engine: string;
model: string;
- result: TranscriptionResultSegmentGroupType[];
+ text: string;
}> => {
const { token, region } = await webApi.generateSpeechToken(params);
const config = sdk.SpeechConfig.fromAuthorizationToken(token, region);
@@ -230,43 +182,10 @@ export const useTranscribe = () => {
reco.sessionStopped = (_s, _e) => {
reco.stopContinuousRecognitionAsync();
- const transcription: TranscriptionResultSegmentType[] = [];
- results.forEach((result) => {
- const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
- const words = best.Display.trim().split(" ");
- best.Words.map((word, index) => {
- let text = word.Word;
- if (words.length === best.Words.length) {
- text = words[index];
- }
- if (
- index === best.Words.length - 1 &&
- !text.trim().match(END_OF_SENTENCE_REGEX)
- ) {
- text = text + ".";
- }
- transcription.push({
- offsets: {
- from: word.Offset / 1e4,
- to: (word.Offset + word.Duration) / 1e4,
- },
- timestamps: {
- from: milisecondsToTimestamp(word.Offset / 1e4),
- to: milisecondsToTimestamp((word.Offset + word.Duration) * 1e4),
- },
- text,
- });
- });
- });
resolve({
engine: "azure",
model: "whisper",
- result: groupTranscription(transcription),
+ text: results.map((result) => result.DisplayText).join(' '),
});
};
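Taken together, every engine branch above now resolves to the same minimal shape, and sentence/word timing is produced later by the alignment step rather than by the recognizers. A hedged sketch of that contract; the engine names come from the hunks, the type name is an assumption:

type TranscribeResult = {
  engine: "whisper" | "openai" | "cloudflare" | "azure";
  model: string;
  text: string; // full transcript; timing is recovered by echogarden alignment
};

// The alignment call shown earlier in this file only needs the audio plus this text:
// const alignment = await EnjoyApp.echogarden.align(
//   new Uint8Array(await blob.arrayBuffer()),
//   originalText || result.text
// );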

View File

@@ -1,5 +1,4 @@
import Pitchfinder from "pitchfinder";
- import { END_OF_SENTENCE_REGEX, MAGIC_TOKEN_REGEX } from "./constants";
import { IPA_MAPPING } from "./constants";
export const extractFrequencies = (props: {
@@ -34,57 +33,6 @@ export function milisecondsToTimestamp(ms: number) {
)}:${seconds.padStart(2, "0")},${milliseconds}`;
}
- export const groupTranscription = (
- transcription: TranscriptionResultSegmentType[]
- ): TranscriptionResultSegmentGroupType[] => {
- const generateGroup = (group?: TranscriptionResultSegmentType[]) => {
- if (!group || group.length === 0) return;
- const firstWord = group[0];
- const lastWord = group[group.length - 1];
- return {
- offsets: {
- from: firstWord.offsets.from,
- to: lastWord.offsets.to,
- },
- text: group.map((w) => w.text.trim()).join(" "),
- timestamps: {
- from: firstWord.timestamps.from,
- to: lastWord.timestamps.to,
- },
- segments: group,
- };
- };
- const groups: TranscriptionResultSegmentGroupType[] = [];
- let group: TranscriptionResultSegmentType[] = [];
- transcription.forEach((segment) => {
- const text = segment.text.trim();
- if (!text) return;
- group.push(segment);
- if (
- !text.match(MAGIC_TOKEN_REGEX) &&
- segment.text.trim().match(END_OF_SENTENCE_REGEX)
- ) {
- // Group a complete sentence;
- groups.push(generateGroup(group));
- // init a new group
- group = [];
- }
- });
- // Group the last group
- const lastSentence = generateGroup(group);
- if (lastSentence) groups.push(lastSentence);
- return groups;
- };
export const convertIpaToNormal = (ipa: string) => {
const mark = ipa.match(/(\ˈ|ˌ)/);
const cleanIpa = ipa.replace(mark ? mark[0] : "", "");