Force alignment for TTS audio (#418)
* add originalText as param
* save original text when added from TTS speech
* fix player in conversation sheet
* minor fix
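At a glance: when audio is created from TTS speech, the exact source text is already
known, so speech-to-text can be skipped entirely and echogarden only has to
force-align the audio against that text. A sketch of the new flow, using names from
this diff (the glue between the steps is illustrative, not the literal code):

    // 1. renderer saves the TTS audio together with its source text
    //    -> the audios handler stores a Transcription row: result = { originalText }
    // 2. useTranscribe() sees originalText, skips whisper, and tags the
    //    result with engine/model "original"
    // 3. forced alignment runs against the known text:
    const alignmentResult = await EnjoyApp.echogarden.align(
      new Uint8Array(await blob.arrayBuffer()),
      originalText // instead of joining recognized segments
    );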
@@ -72,6 +72,7 @@ class AudiosHandler {
     params: {
       name?: string;
       coverUrl?: string;
+      originalText?: string;
     } = {}
   ) {
     let file = uri;
@@ -95,19 +96,33 @@ class AudiosHandler {
       }
     }
 
-    return Audio.buildFromLocalFile(file, {
-      source,
-      ...params,
-    })
-      .then((audio) => {
-        return audio.toJSON();
-      })
-      .catch((err) => {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.audio.failedToAdd", { error: err.message }),
-        });
-      });
+    try {
+      const audio = await Audio.buildFromLocalFile(file, {
+        source,
+        name: params.name,
+        coverUrl: params.coverUrl,
+      });
+
+      // create transcription if originalText is provided
+      const { originalText } = params;
+      if (originalText) {
+        await Transcription.create({
+          targetType: "Audio",
+          targetId: audio.id,
+          targetMd5: audio.md5,
+          result: {
+            originalText,
+          },
+        });
+      }
+
+      return audio.toJSON();
+    } catch (err) {
+      return event.sender.send("on-notification", {
+        type: "error",
+        message: t("models.audio.failedToAdd", { error: err.message }),
+      });
+    }
   }
 
   private async update(
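Note that the `...params` spread was replaced with explicit `name`/`coverUrl` keys, so
`originalText` is not forwarded into `Audio.buildFromLocalFile`; it is persisted as a
Transcription row instead. For context, a sketch of a renderer-side call into this
handler; `EnjoyApp.audios.create` is an assumed bridge name (the preload bridge itself
is outside this diff):

    // hypothetical renderer call; the handler above persists originalText
    // as a Transcription row attached to the newly created Audio
    const audio = await EnjoyApp.audios.create(filePath, {
      name: "speech title",
      originalText: speechText, // full TTS input text, kept for forced alignment
    });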
@@ -229,6 +229,12 @@ export class Audio extends Model<Audio> {
         targetType: "Audio",
       },
     });
+    Transcription.destroy({
+      where: {
+        targetId: audio.id,
+        targetType: "Audio",
+      },
+    });
 
     const webApi = new Client({
       baseUrl: process.env.WEB_API_URL || WEB_API_URL,
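This appears to extend the model's cleanup logic: the tail of an existing destroy call
is visible above, and the new block removes the audio's Transcription rows as well. A
minimal sketch of the pattern, assuming a sequelize-typescript @AfterDestroy hook (the
actual hook name and decorator are not shown in this hunk):

    // sketch: cascade-delete rows that reference the destroyed audio
    @AfterDestroy
    static async cleanup(audio: Audio) {
      await Transcription.destroy({
        where: { targetId: audio.id, targetType: "Audio" },
      });
    }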
@@ -26,6 +26,10 @@ export const AudioPlayer = (props: { id?: string; md5?: string }) => {
 
   useEffect(() => {
     setRef(ref);
+
+    return () => {
+      setRef(null);
+    };
   }, [ref]);
 
   return (
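The player registers its container element with the shared media-player context, and
the added cleanup unregisters it on unmount. A sketch of the registration pattern,
assuming the component reads setRef from MediaPlayerProviderContext (the provider
changes later in this diff use the same names; the import path is a guess):

    import { useContext, useEffect, useRef } from "react";
    // assumed import path for the context
    import { MediaPlayerProviderContext } from "@renderer/context";

    const PlayerContainer = () => {
      const ref = useRef<HTMLDivElement>(null);
      const { setRef } = useContext(MediaPlayerProviderContext);

      useEffect(() => {
        setRef(ref); // register this container with the provider
        return () => setRef(null); // unregister on unmount -- the fix above
      }, [ref]);

      return <div ref={ref} />;
    };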
@@ -72,6 +72,7 @@ export const MediaCurrentRecording = (props: { height?: number }) => {
 
   const removeComparingPitchContour = () => {
     if (!wavesurfer) return;
+    if (!regions) return;
 
     regions
       .getRegions()

@@ -72,7 +72,7 @@ export const MediaTranscription = () => {
     } as ScrollIntoViewOptions);
   }, [currentSegmentIndex, transcription, containerRef]);
 
-  if (!transcription?.result) {
+  if (!transcription?.result?.timeline) {
     return null;
   }

@@ -104,6 +104,7 @@ export const AssistantMessageComponent = (props: {
       speech.text.length > 20
         ? speech.text.substring(0, 17).trim() + "..."
         : speech.text,
+      originalText: speech.text,
     });
     setResourcing(false);
   }
@@ -251,7 +252,7 @@ export const AssistantMessageComponent = (props: {
         </SheetClose>
       </SheetHeader>
 
-      {Boolean(speech) && <AudioPlayer md5={speech.md5} />}
+      {Boolean(speech) && shadowing && <AudioPlayer md5={speech.md5} />}
     </SheetContent>
   </Sheet>
 </div>
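These two hunks carry the commit's headline items: the TTS speech's full text rides
along as originalText when the audio resource is created, and the sheet's player now
mounts only while shadowing is active, so it cannot compete with another AudioPlayer
for the provider's single container ref. The first hunk's surrounding call presumably
looks like the following (createSpeechResource and the name: key are hypothetical,
inferred from the visible argument lines):

    // inferred shape of the call the first hunk patches
    createSpeechResource({
      name:
        speech.text.length > 20
          ? speech.text.substring(0, 17).trim() + "..."
          : speech.text,
      originalText: speech.text, // added: full text for forced alignment
    });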
@@ -117,7 +117,7 @@ export const MediaPlayerProvider = ({
   const initializeWavesurfer = async () => {
     if (!media) return;
     if (!mediaProvider) return;
-    if (!ref.current) return;
+    if (!ref?.current) return;
 
     const ws = WaveSurfer.create({
       container: ref.current,
@@ -299,22 +299,6 @@ export const MediaPlayerProvider = ({
     );
   };
 
-  useEffect(() => {
-    if (!media) return;
-
-    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
-      setWaveForm(waveform);
-    });
-  }, [media]);
-
-  /*
-   * Initialize wavesurfer when container ref is available
-   * and mediaProvider is available
-   */
-  useEffect(() => {
-    initializeWavesurfer();
-  }, [media, ref, mediaProvider]);
-
   /*
    * When wavesurfer is decoded,
    * set up event listeners for wavesurfer
@@ -353,6 +337,7 @@ export const MediaPlayerProvider = ({
 
     return () => {
       subscriptions.forEach((unsub) => unsub());
+      wavesurfer?.destroy();
     };
   }, [wavesurfer]);
 
@@ -372,6 +357,10 @@ export const MediaPlayerProvider = ({
     } else if (activeRegion.id.startsWith("word-region")) {
       setFitZoomRatio(containerWidth / 3 / duration / minPxPerSec);
     }
+
+    return () => {
+      setFitZoomRatio(1.0);
+    }
   }, [ref, wavesurfer, activeRegion]);
 
   /*
@@ -395,7 +384,7 @@ export const MediaPlayerProvider = ({
     if (!activeRegion) return;
 
     renderPitchContour(activeRegion);
-  }, [activeRegion]);
+  }, [wavesurfer, activeRegion]);
 
   /*
    * Update player styles
@@ -408,6 +397,22 @@ export const MediaPlayerProvider = ({
     scrollContainer.style.scrollbarWidth = "thin";
   }, [decoded, wavesurfer]);
 
+  useEffect(() => {
+    if (!media) return;
+
+    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
+      setWaveForm(waveform);
+    });
+  }, [media]);
+
+  /*
+   * Initialize wavesurfer when container ref is available
+   * and mediaProvider is available
+   */
+  useEffect(() => {
+    initializeWavesurfer();
+  }, [media, ref, mediaProvider]);
+
   return (
     <MediaPlayerProviderContext.Provider
       value={{
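Taken together these changes tighten the wavesurfer lifecycle: the waveform lookup and
initialization effects move below the style effect, the instance is destroyed when its
event-listener effect re-runs, pitch contours re-render when wavesurfer itself changes
(not only activeRegion), and the fit-zoom ratio resets when the region changes. A
condensed sketch of the lifecycle the provider now follows (WaveSurfer.create and the
unsubscribe-returning .on() are wavesurfer.js v7 API; setDecoded is an assumed setter
for the decoded state seen in the deps above):

    // create on [media, ref, mediaProvider] -> listen on [wavesurfer] -> destroy in cleanup
    const ws = WaveSurfer.create({ container: ref.current });
    const subscriptions = [ws.on("decode", () => setDecoded(true))];
    return () => {
      subscriptions.forEach((unsub) => unsub());
      ws.destroy(); // no stale instance is left behind when initializeWavesurfer re-runs
    };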
@@ -79,34 +79,43 @@ export const useTranscribe = () => {
     params?: {
       targetId?: string;
       targetType?: string;
+      originalText?: string;
     }
   ): Promise<{
     engine: string;
     model: string;
     alignmentResult: AlignmentResult;
+    originalText?: string;
   }> => {
     const blob = await transcode(mediaSrc);
+    const { targetId, targetType, originalText } = params || {};
 
     let result;
-    if (whisperConfig.service === "local") {
+    if (originalText) {
+      result = {
+        engine: "original",
+        model: "original",
+      };
+    } else if (whisperConfig.service === "local") {
       result = await transcribeByLocal(blob);
     } else if (whisperConfig.service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (whisperConfig.service === "openai") {
       result = await transcribeByOpenAi(blob);
     } else if (whisperConfig.service === "azure") {
-      result = await transcribeByAzureAi(blob, params);
+      result = await transcribeByAzureAi(blob, { targetId, targetType });
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }
 
     const alignmentResult = await EnjoyApp.echogarden.align(
       new Uint8Array(await blob.arrayBuffer()),
-      result.result.map((segment) => segment.text).join(" ")
+      originalText || result.result.map((segment) => segment.text).join(" ")
     );
 
     return {
       ...result,
+      originalText,
       alignmentResult,
     };
   };
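One subtlety: when originalText is set, result is just { engine: "original", model:
"original" } and has no result array of recognized segments. The `originalText || ...`
fallback short-circuits before result.result is dereferenced, so the alignment text is
always defined:

    // equivalent expansion of the alignment-text expression above
    const alignmentText = originalText
      ? originalText // TTS source text: no speech-to-text was run
      : result.result.map((segment) => segment.text).join(" "); // recognized text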
@@ -29,30 +29,41 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
       setTranscription(record);
     }
   };
-  const findOrCreateTranscription = async () => {
-    if (!media) return;
-    if (transcription) return;
-
-    return EnjoyApp.transcriptions
-      .findOrCreate({
-        targetId: media.id,
-        targetType: media.mediaType,
-      })
-      .then((t) => {
-        if (t.result && !t.result["transcript"]) {
-          t.result = null;
-        }
-        setTranscription(t);
-      })
-      .catch((err) => {
-        toast.error(err.message);
-      });
-  };
+  const findOrCreateTranscription =
+    async (): Promise<TranscriptionType | void> => {
+      if (!media) return;
+      if (transcription?.targetId === media.id) return;
+
+      return EnjoyApp.transcriptions
+        .findOrCreate({
+          targetId: media.id,
+          targetType: media.mediaType,
+        })
+        .then((t) => {
+          if (t.result && !t.result["timeline"]) {
+            t.result = {
+              originalText: t.result?.originalText,
+            };
+          }
+          setTranscription(t);
+          return t;
+        })
+        .catch((err) => {
+          toast.error(err.message);
+        });
+    };
 
   const generateTranscription = async () => {
     if (transcribing) return;
-    if (!transcription) {
-      await findOrCreateTranscription();
-    }
+
+    let originalText: string;
+    if (transcription) {
+      originalText = transcription.result?.originalText;
+    } else {
+      const r = await findOrCreateTranscription();
+      if (r) {
+        originalText = r.result?.originalText;
+      }
+    }
 
     setTranscribing(true);
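The reshaping in .then() is what lets a TTS-created row flow through generation: the
audios handler stores result = { originalText } with no timeline, so the row is
normalized to keep only the original text, which generateTranscription reads and
forwards to transcribe(). A walkthrough with an illustrative value:

    // lifecycle of a TTS-created transcription row (shapes from this diff)
    let row = { result: { originalText: "Hello there." } }; // created by the audios handler
    // findOrCreateTranscription: result has no timeline -> keep only originalText
    // generateTranscription:     originalText = row.result.originalText
    // transcribe():              engine/model "original"; echogarden aligns against it
    // on success:                result = { timeline, transcript, originalText }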
@@ -61,6 +72,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     const { engine, model, alignmentResult } = await transcribe(media.src, {
       targetId: media.id,
       targetType: media.mediaType,
+      originalText,
     });
 
     let timeline: TimelineEntry[] = [];
@@ -105,6 +117,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     result: {
       timeline: timeline,
       transcript: alignmentResult.transcript,
+      originalText,
     },
     engine,
     model,
@@ -126,14 +139,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   });
 
   const transcript = (res?.transcriptions || []).filter((t) =>
-    ["base", "small", "medium", "large", "whisper-1"].includes(t.model)
+    ["base", "small", "medium", "large", "whisper-1", "original"].includes(
+      t.model
+    )
   )?.[0];
 
   if (!transcript) {
     return Promise.reject("Transcription not found");
   }
 
-  if (!transcript.result["transcript"]) {
+  if (!transcript.result["timeline"]) {
     return Promise.reject("Transcription not aligned");
   }
@@ -149,17 +164,23 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     try {
       await findTranscriptionFromWebApi();
     } catch (err) {
-      console.error(err);
+      console.warn(err);
       await generateTranscription();
     }
   };
 
+  /*
+   * find or create transcription
+   */
   useEffect(() => {
     if (!media) return;
 
     findOrCreateTranscription();
   }, [media]);
 
+  /*
+   * auto-generate transcription result
+   */
   useEffect(() => {
     if (!transcription) return;
 
@@ -167,7 +188,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
 
   if (
     transcription.state == "pending" ||
-    !transcription.result?.["transcript"]
+    !transcription.result?.["timeline"]
   ) {
     findOrGenerateTranscription();
   }

enjoy/src/types/transcription.d.ts
@@ -5,7 +5,7 @@ type TranscriptionType = {
   state: "pending" | "processing" | "finished";
   engine: string;
   model: string;
-  result: AlignmentResult;
+  result: AlignmentResult & { original?: string };
 };
 
 type TranscriptionResultSegmentType = {
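For reference, the usage elsewhere in this diff (alignmentResult.transcript plus a
timeline of TimelineEntry items) implies roughly the following shape; echogarden's
real typings may differ, and note that the runtime code reads result.originalText
while the intersection here adds original?:

    // assumed shape, inferred from usage in this diff
    type AlignmentResult = {
      transcript: string;
      timeline: TimelineEntry[]; // word/segment timings from forced alignment
    };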