Improve transcription (#1182)

* handle special characters in the transcription when assessing

* allow playing the original pronunciation in the assessment result
an-lee
2024-11-15 15:09:07 +08:00
committed by GitHub
parent 1f531b0cbc
commit b8167a99d8
11 changed files with 172 additions and 103 deletions

View File

@@ -927,5 +927,7 @@
"openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
"enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported.",
"compressMediaBeforeAdding": "Compress media before adding",
"keepOriginalMedia": "Keep original media"
"keepOriginalMedia": "Keep original media",
"myPronunciation": "My pronunciation",
"originalPronunciation": "Original pronunciation"
}

View File

@@ -927,5 +927,7 @@
"openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
"enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务,支持 OpenAI 或 Azure。",
"compressMediaBeforeAdding": "添加前压缩媒体",
"keepOriginalMedia": "保存原始媒体"
"keepOriginalMedia": "保存原始媒体",
"myPronunciation": "我的发音",
"originalPronunciation": "原始发音"
}

View File

@@ -1,4 +1,4 @@
import { useEffect, useContext, useRef, useState } from "react";
import { useEffect, useContext, useRef, useState, useMemo } from "react";
import {
AppSettingsProviderContext,
HotKeysSettingsProviderContext,
@@ -50,17 +50,13 @@ import { formatDuration } from "@renderer/lib/utils";
import { useHotkeys } from "react-hotkeys-hook";
import { LiveAudioVisualizer } from "react-audio-visualize";
import debounce from "lodash/debounce";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
const ACTION_BUTTON_HEIGHT = 35;
export const MediaCurrentRecording = () => {
const {
isRecording,
isPaused,
cancelRecording,
togglePauseResume,
stopRecording,
recordingTime,
mediaRecorder,
currentRecording,
renderPitchContour: renderMediaPitchContour,
regions: mediaRegions,
@@ -71,6 +67,8 @@ export const MediaCurrentRecording = () => {
currentSegment,
createSegment,
currentTime: mediaCurrentTime,
caption,
toggleRegion,
} = useContext(MediaShadowProviderContext);
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
const { currentHotkeys } = useContext(HotKeysSettingsProviderContext);
@@ -263,6 +261,23 @@ export const MediaCurrentRecording = () => {
});
};
const playWord = (word: string, index: number) => {
const candidates = caption.timeline.filter(
(w: TimelineEntry) => w.text.toLowerCase() === word.toLowerCase()
);
const target = candidates[index];
if (!target) return;
const wordIndex = caption.timeline.findIndex(
(w) => w.startTime === target.startTime
);
toggleRegion([wordIndex]);
setTimeout(() => {
wavesurfer?.playPause();
}, 250);
};
const calContainerSize = () => {
const size = ref?.current
?.closest(".media-recording-wrapper")
@@ -685,7 +700,12 @@ export const MediaCurrentRecording = () => {
</SheetClose>
</SheetHeader>
<RecordingDetail recording={currentRecording} />
<RecordingDetail
recording={currentRecording}
onPlayOrigin={(word: string, index: number = 0) =>
playWord(word, index)
}
/>
</SheetContent>
</Sheet>
</div>
@@ -745,7 +765,6 @@ const MediaRecorder = () => {
const {
mediaRecorder,
recordingTime,
isRecording,
isPaused,
cancelRecording,
togglePauseResume,
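
The playWord handler added in this file resolves a word plus its occurrence index back to an absolute position in the caption timeline before toggling a region and playing it. A minimal standalone sketch of that lookup, using a hypothetical Entry shape rather than echogarden's TimelineEntry type:

// Sketch only: resolve (word, occurrence) back to an absolute timeline index.
interface Entry {
  text: string;
  startTime: number;
}

function resolveWordIndex(timeline: Entry[], word: string, occurrence: number): number {
  const candidates = timeline.filter(
    (e) => e.text.toLowerCase() === word.toLowerCase()
  );
  const target = candidates[occurrence];
  if (!target) return -1;
  return timeline.findIndex((e) => e.startTime === target.startTime);
}

// With timeline texts ["The", "cat", "saw", "the", "dog"],
// resolveWordIndex(timeline, "the", 1) === 3 (the second "the").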

View File

@@ -318,7 +318,7 @@ export const MediaPlayerControls = () => {
wavesurfer.pause();
setTimeout(() => {
activeRegionDebouncePlay();
}, 500);
}, 250);
} else if (playMode === "single") {
wavesurfer.pause();
}

View File

@@ -34,7 +34,13 @@ export const MediaCaption = (props: {
const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
let words = caption.text.split(" ");
let words = caption.text
.replace(/ ([.,!?:;])/g, "$1")
.replace(/ (['"")])/g, "$1")
.replace(/ \.\.\./g, "...")
.split(/([—]|\s+)/g)
.filter((word) => word.trim() !== "" && word !== "—");
const ipas = caption.timeline.map((w) =>
w.timeline?.map((t) =>
t.timeline && language.startsWith("en")
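
The new splitting in this hunk is what the commit message calls special-character handling: detached punctuation is re-attached to the preceding word, ellipses are rejoined, and em dashes are dropped, so the rendered words line up with the assessment output. A rough standalone version of the same transformation (illustrative helper name, not the component code itself):

// Illustration of the punctuation-aware split.
function splitCaptionWords(text: string): string[] {
  return text
    .replace(/ ([.,!?:;])/g, "$1") // re-attach trailing punctuation
    .replace(/ \.\.\./g, "...")    // re-attach ellipses
    .split(/([—]|\s+)/g)           // split on whitespace and em dashes
    .filter((w) => w.trim() !== "" && w !== "—");
}

// splitCaptionWords("Well — hello , world !") -> ["Well", "hello,", "world!"]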

View File

@@ -1,4 +1,4 @@
import { useEffect, useState, useContext, useRef } from "react";
import { useEffect, useState, useContext, useRef, useMemo } from "react";
import { MediaShadowProviderContext } from "@renderer/context";
import cloneDeep from "lodash/cloneDeep";
import {
@@ -11,10 +11,6 @@ import {
} from "@renderer/components/ui";
import { MediaCaption, MediaCaptionActions } from "@renderer/components";
import { t } from "i18next";
import {
Timeline,
TimelineEntry,
} from "echogarden/dist/utilities/Timeline.d.js";
import {
MediaCaptionAnalysis,
MediaCaptionNote,
@@ -29,12 +25,14 @@ export const MediaRightPanel = (props: {
}) => {
const { className, setDisplayPanel } = props;
const {
caption,
currentSegmentIndex,
currentTime,
transcription,
regions,
activeRegion,
setActiveRegion,
toggleRegion,
editingRegion,
setEditingRegion,
setTranscriptionDraft,
@@ -47,7 +45,6 @@ export const MediaRightPanel = (props: {
const [displayIpa, setDisplayIpa] = useState<boolean>(true);
const [displayNotes, setDisplayNotes] = useState<boolean>(true);
const [caption, setCaption] = useState<TimelineEntry | null>(null);
const [tab, setTab] = useState<string>("translation");
const toggleMultiSelect = (event: KeyboardEvent) => {
@@ -79,67 +76,6 @@ export const MediaRightPanel = (props: {
}
};
const toggleRegion = (params: number[]) => {
if (!activeRegion) return;
if (editingRegion) {
toast.warning(t("currentRegionIsBeingEdited"));
return;
}
if (params.length === 0) {
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
}
return;
}
const startIndex = Math.min(...params);
const endIndex = Math.max(...params);
const startWord = caption.timeline[startIndex];
if (!startWord) return;
const endWord = caption.timeline[endIndex] || startWord;
const start = startWord.startTime;
const end = endWord.endTime;
// If the active region is a word region, then merge the selected words into a single region.
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
// If the active region is a meaning group region, then activate the segment region.
} else if (activeRegion.id.startsWith("meaning-group-region")) {
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
// If the active region is a segment region, then create a new word region.
} else {
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: false,
});
setActiveRegion(region);
}
};
useEffect(() => {
if (!caption) return;
@@ -160,6 +96,7 @@ export const MediaRightPanel = (props: {
toggleRegion(selectedIndices);
}, [caption, selectedIndices]);
// Edit region to update transcription draft
useEffect(() => {
if (!activeRegion) return;
if (!activeRegion.id.startsWith("word-region")) return;
@@ -234,12 +171,6 @@ export const MediaRightPanel = (props: {
};
}, [editingRegion]);
useEffect(() => {
setCaption(
(transcription?.result?.timeline as Timeline)?.[currentSegmentIndex]
);
}, [currentSegmentIndex, transcription]);
useEffect(() => {
return () => setSelectedIndices([]);
}, [caption]);

View File

@@ -4,16 +4,13 @@ import {
MediaRightPanel,
MediaLeftPanel,
MediaBottomPanel,
MediaProvider,
} from "@renderer/components";
import {
Button,
ResizableHandle,
ResizablePanel,
ResizablePanelGroup,
} from "@renderer/components/ui";
import { useContext, useState } from "react";
import { RefreshCcwDotIcon } from "lucide-react";
export const MediaShadowPlayer = () => {
return (

View File

@@ -8,8 +8,9 @@ export const PronunciationAssessmentFulltextResult = (props: {
words: PronunciationAssessmentWordResultType[];
currentTime?: number;
src?: string;
onPlayOrigin?: (word: string, index: number) => void;
}) => {
const { words, currentTime, src } = props;
const { words, currentTime, src, onPlayOrigin } = props;
const [errorStats, setErrorStats] = useState({
mispronunciation: 0,
omission: 0,
@@ -65,6 +66,16 @@ export const PronunciationAssessmentFulltextResult = (props: {
errorDisplay={errorDisplay}
currentTime={currentTime}
src={src}
onPlayOrigin={() => {
if (!onPlayOrigin) return;
const word = words[index];
const candidates = words.filter((w) => w.word === word.word);
const wordIndex = candidates.findIndex(
(w) => w.offset === word.offset
);
onPlayOrigin(word.word, wordIndex);
}}
/>
))}
</div>
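
The onPlayOrigin wrapper above converts the absolute index in the assessment word list into a (word, occurrence) pair, since the assessment words and the caption timeline are not guaranteed to share indices one-to-one; only the word text plus how many identical words precede it survives the round trip. A compact sketch of that forward mapping, with a hypothetical AssessedWord shape:

// Sketch: turn an absolute index into (word, occurrence among equal words).
interface AssessedWord {
  word: string;
  offset: number; // start offset, unique per word in the result
}

function toOccurrence(words: AssessedWord[], index: number): [string, number] {
  const target = words[index];
  if (!target) return ["", -1];
  const candidates = words.filter((w) => w.word === target.word);
  const occurrence = candidates.findIndex((w) => w.offset === target.offset);
  return [target.word, occurrence];
}

// words: [{word:"the",offset:0},{word:"cat",offset:5e6},{word:"the",offset:9e6}]
// toOccurrence(words, 2) -> ["the", 1]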

View File

@@ -20,6 +20,7 @@ export const PronunciationAssessmentWordResult = (props: {
monotone: boolean;
};
currentTime?: number;
onPlayOrigin?: () => void;
}) => {
const {
result,
@@ -32,6 +33,7 @@ export const PronunciationAssessmentWordResult = (props: {
monotone: true,
},
currentTime = 0,
onPlayOrigin,
} = props;
const audio = useRef<HTMLAudioElement>(null);
@@ -71,25 +73,41 @@ export const PronunciationAssessmentWordResult = (props: {
}[result.pronunciationAssessment.errorType];
const play = () => {
const { offset, duration } = result;
if (!audio.current || !props.src) return;
// create a new audio element and play the segment
audio.current.src = `${props.src}#t=${(offset * 1.0) / 1e7},${
((offset + duration) * 1.0) / 1e7
}`;
const { offset, duration } = result;
if (!offset || !duration) return;
const startTime = (offset * 1.0) / 1e7;
const endTime = ((offset + duration) * 1.0) / 1e7;
audio.current.currentTime = startTime;
// Add timeupdate listener to stop at the end of the segment
const handleTimeUpdate = () => {
if (audio.current.currentTime >= endTime) {
audio.current.pause();
audio.current.removeEventListener("timeupdate", handleTimeUpdate);
}
};
audio.current.addEventListener("timeupdate", handleTimeUpdate);
audio.current.play();
};
useEffect(() => {
if (!audio.current) {
audio.current = new Audio();
audio.current = new Audio(props.src);
}
return () => {
audio.current?.pause();
delete audio.current;
if (audio.current) {
audio.current.pause();
audio.current.removeEventListener("timeupdate", () => {});
audio.current = null;
}
};
}, []);
}, [props.src]);
return (
<Popover>
@@ -152,11 +170,20 @@ export const PronunciationAssessmentWordResult = (props: {
</div>
)}
<div className="">
<div className="flex items-center space-x-2">
<span className="text-sm">{t("myPronunciation")}:</span>
<Button onClick={play} variant="ghost" size="icon">
<Volume2Icon className="w-5 h-5" />
</Button>
</div>
{onPlayOrigin && (
<div className="flex items-center space-x-2">
<span className="text-sm">{t("originalPronunciation")}:</span>
<Button onClick={onPlayOrigin} variant="ghost" size="icon">
<Volume2Icon className="w-5 h-5" />
</Button>
</div>
)}
</PopoverContent>
</Popover>
);
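
The rewritten play in this file stops assigning a media-fragment URL (#t=start,end) to src and instead seeks with currentTime, pausing via a timeupdate listener. The underlying pattern, as a standalone sketch; note that removal only works when the same handler reference is passed to removeEventListener:

// Generic sketch: play only the [start, end] second range of an audio element.
function playSegment(audio: HTMLAudioElement, start: number, end: number) {
  audio.currentTime = start;
  const onTimeUpdate = () => {
    if (audio.currentTime >= end) {
      audio.pause();
      audio.removeEventListener("timeupdate", onTimeUpdate);
    }
  };
  audio.addEventListener("timeupdate", onTimeUpdate);
  audio.play();
}

// e.g. playSegment(new Audio(src), 1.2, 1.8) plays roughly 0.6 s of audio.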

View File

@@ -14,8 +14,9 @@ export const RecordingDetail = (props: {
recording: RecordingType;
pronunciationAssessment?: PronunciationAssessmentType;
onAssess?: (assessment: PronunciationAssessmentType) => void;
onPlayOrigin?: (word: string) => void;
}) => {
const { recording, onAssess } = props;
const { recording, onAssess, onPlayOrigin } = props;
if (!recording) return;
const [pronunciationAssessment, setPronunciationAssessment] =
@@ -40,7 +41,7 @@ export const RecordingDetail = (props: {
setAssessing(true);
createAssessment({
recording,
reference: recording.referenceText || "",
reference: recording.referenceText?.replace(/[—]/g, ", ") || "",
language: recording.language || learningLanguage,
})
.then((assessment) => {
@@ -76,6 +77,7 @@ export const RecordingDetail = (props: {
words={result.words}
currentTime={currentTime}
src={recording.src}
onPlayOrigin={onPlayOrigin}
/>
) : (
<ScrollArea className="min-h-72 py-4 px-8 select-text">
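
Besides threading onPlayOrigin through, this file normalizes the reference text before requesting an assessment: em dashes are replaced with ", ", likely so the service receives plain comma-separated clauses instead of a character it may not align well. A one-line illustration:

// "I was there—wasn't I?" -> "I was there, wasn't I?"
const reference = "I was there—wasn't I?".replace(/[—]/g, ", ");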

View File

@@ -1,4 +1,4 @@
import { createContext, useEffect, useState, useContext } from "react";
import { createContext, useEffect, useState, useContext, useMemo } from "react";
import { convertIpaToNormal, extractFrequencies } from "@/utils";
import { AppSettingsProviderContext } from "@renderer/context";
import {
@@ -12,7 +12,10 @@ import Regions, {
type Region as RegionType,
} from "wavesurfer.js/dist/plugins/regions";
import Chart from "chart.js/auto";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import {
Timeline,
TimelineEntry,
} from "echogarden/dist/utilities/Timeline.d.js";
import { toast } from "@renderer/components/ui";
import { Tooltip } from "react-tooltip";
import { useAudioRecorder } from "react-audio-voice-recorder";
@@ -48,6 +51,7 @@ type MediaShadowContextType = {
regions: Regions | null;
activeRegion: RegionType;
setActiveRegion: (region: RegionType) => void;
toggleRegion: (params: number[]) => void;
renderPitchContour: (
region: RegionType,
options?: {
@@ -74,6 +78,7 @@ type MediaShadowContextType = {
transcribingOutput: string;
transcriptionDraft: TranscriptionType["result"];
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
caption: TimelineEntry;
// Recordings
startRecording: () => void;
stopRecording: () => void;
@@ -180,6 +185,10 @@ export const MediaShadowProvider = ({
toast.error(exception.message);
});
const caption = useMemo(() => {
return (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex];
}, [currentSegmentIndex, transcription]);
const { segment, createSegment } = useSegments({
targetId: media?.id,
targetType: media?.mediaType,
@@ -466,6 +475,67 @@ export const MediaShadowProvider = ({
);
};
const toggleRegion = (params: number[]) => {
if (!activeRegion) return;
if (editingRegion) {
toast.warning(t("currentRegionIsBeingEdited"));
return;
}
if (params.length === 0) {
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
}
return;
}
const startIndex = Math.min(...params);
const endIndex = Math.max(...params);
const startWord = caption.timeline[startIndex];
if (!startWord) return;
const endWord = caption.timeline[endIndex] || startWord;
const start = startWord.startTime;
const end = endWord.endTime;
// If the active region is a word region, then merge the selected words into a single region.
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
// If the active region is a meaning group region, then activate the segment region.
} else if (activeRegion.id.startsWith("meaning-group-region")) {
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
// If the active region is a segment region, then create a new word region.
} else {
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: false,
});
setActiveRegion(region);
}
};
/*
* When wavesurfer is decoded,
* set up event listeners for wavesurfer
@@ -667,6 +737,7 @@ export const MediaShadowProvider = ({
pitchChart,
activeRegion,
setActiveRegion,
toggleRegion,
renderPitchContour,
editingRegion,
setEditingRegion,
@@ -676,6 +747,7 @@ export const MediaShadowProvider = ({
transcribingOutput,
transcriptionDraft,
setTranscriptionDraft,
caption,
startRecording,
stopRecording,
cancelRecording,