Improve transcription (#1182)
* handle special characters in transcription when assessing * may play original pronunciation in assessment result
This commit is contained in:
@@ -927,5 +927,7 @@
|
||||
"openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
|
||||
"enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported.",
|
||||
"compressMediaBeforeAdding": "Compress media before adding",
|
||||
"keepOriginalMedia": "Keep original media"
|
||||
"keepOriginalMedia": "Keep original media",
|
||||
"myPronunciation": "My pronunciation",
|
||||
"originalPronunciation": "Original pronunciation"
|
||||
}
|
||||
|
||||
@@ -927,5 +927,7 @@
|
||||
"openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
|
||||
"enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务,支持 OpenAI 或 Azure。",
|
||||
"compressMediaBeforeAdding": "添加前压缩媒体",
|
||||
"keepOriginalMedia": "保存原始媒体"
|
||||
"keepOriginalMedia": "保存原始媒体",
|
||||
"myPronunciation": "我的发音",
|
||||
"originalPronunciation": "原始发音"
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { useEffect, useContext, useRef, useState } from "react";
|
||||
import { useEffect, useContext, useRef, useState, useMemo } from "react";
|
||||
import {
|
||||
AppSettingsProviderContext,
|
||||
HotKeysSettingsProviderContext,
|
||||
@@ -50,17 +50,13 @@ import { formatDuration } from "@renderer/lib/utils";
|
||||
import { useHotkeys } from "react-hotkeys-hook";
|
||||
import { LiveAudioVisualizer } from "react-audio-visualize";
|
||||
import debounce from "lodash/debounce";
|
||||
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
|
||||
|
||||
const ACTION_BUTTON_HEIGHT = 35;
|
||||
export const MediaCurrentRecording = () => {
|
||||
const {
|
||||
isRecording,
|
||||
isPaused,
|
||||
cancelRecording,
|
||||
togglePauseResume,
|
||||
stopRecording,
|
||||
recordingTime,
|
||||
mediaRecorder,
|
||||
currentRecording,
|
||||
renderPitchContour: renderMediaPitchContour,
|
||||
regions: mediaRegions,
|
||||
@@ -71,6 +67,8 @@ export const MediaCurrentRecording = () => {
|
||||
currentSegment,
|
||||
createSegment,
|
||||
currentTime: mediaCurrentTime,
|
||||
caption,
|
||||
toggleRegion,
|
||||
} = useContext(MediaShadowProviderContext);
|
||||
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
|
||||
const { currentHotkeys } = useContext(HotKeysSettingsProviderContext);
|
||||
@@ -263,6 +261,23 @@ export const MediaCurrentRecording = () => {
|
||||
});
|
||||
};
|
||||
|
||||
/**
 * Select one word of the current caption as the active region and toggle
 * playback, so the original pronunciation of that word can be heard.
 *
 * @param word  Word text to look up in `caption.timeline`; compared
 *              case-insensitively.
 * @param index 0-based occurrence of `word` to use when the caption contains
 *              the word more than once.
 */
const playWord = (word: string, index: number) => {
  // `caption` comes from the media context and may be undefined, so fall back
  // to an empty timeline instead of dereferencing it blindly.
  const timeline: TimelineEntry[] = caption?.timeline ?? [];
  const needle = word.toLowerCase();

  // Single pass: count matching entries and stop at the requested occurrence.
  // This yields the position of the matched entry itself rather than the
  // first entry that merely shares its start time.
  let seen = 0;
  const wordIndex = timeline.findIndex(
    (entry) => entry.text.toLowerCase() === needle && seen++ === index
  );
  if (wordIndex === -1) return;

  toggleRegion([wordIndex]);
  // NOTE(review): the 250 ms delay presumably lets the region change take
  // effect before playback is toggled — confirm.
  setTimeout(() => {
    wavesurfer?.playPause();
  }, 250);
};
|
||||
|
||||
const calContainerSize = () => {
|
||||
const size = ref?.current
|
||||
?.closest(".media-recording-wrapper")
|
||||
@@ -685,7 +700,12 @@ export const MediaCurrentRecording = () => {
|
||||
</SheetClose>
|
||||
</SheetHeader>
|
||||
|
||||
<RecordingDetail recording={currentRecording} />
|
||||
<RecordingDetail
|
||||
recording={currentRecording}
|
||||
onPlayOrigin={(word: string, index: number = 0) =>
|
||||
playWord(word, index)
|
||||
}
|
||||
/>
|
||||
</SheetContent>
|
||||
</Sheet>
|
||||
</div>
|
||||
@@ -745,7 +765,6 @@ const MediaRecorder = () => {
|
||||
const {
|
||||
mediaRecorder,
|
||||
recordingTime,
|
||||
isRecording,
|
||||
isPaused,
|
||||
cancelRecording,
|
||||
togglePauseResume,
|
||||
|
||||
@@ -318,7 +318,7 @@ export const MediaPlayerControls = () => {
|
||||
wavesurfer.pause();
|
||||
setTimeout(() => {
|
||||
activeRegionDebouncePlay();
|
||||
}, 500);
|
||||
}, 250);
|
||||
} else if (playMode === "single") {
|
||||
wavesurfer.pause();
|
||||
}
|
||||
|
||||
@@ -34,7 +34,13 @@ export const MediaCaption = (props: {
|
||||
|
||||
const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
|
||||
|
||||
// Split the caption text into display words:
//  1. drop the single space in front of . , ! ? : ; and in front of ' " )
//     so that punctuation stays attached to the preceding word,
//  2. do the same for a space in front of an ellipsis ("..."),
//  3. split on em dashes and whitespace runs, discarding the separators.
// (The earlier `caption.text.split(" ")` declaration is superseded by this
// one; keeping both would redeclare `words`.)
let words = caption.text
  .replace(/ ([.,!?:;])/g, "$1")
  .replace(/ (['"")])/g, "$1")
  .replace(/ \.\.\./g, "...")
  .split(/([—]|\s+)/g)
  .filter((word) => word.trim() !== "" && word !== "—");
|
||||
|
||||
const ipas = caption.timeline.map((w) =>
|
||||
w.timeline?.map((t) =>
|
||||
t.timeline && language.startsWith("en")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { useEffect, useState, useContext, useRef } from "react";
|
||||
import { useEffect, useState, useContext, useRef, useMemo } from "react";
|
||||
import { MediaShadowProviderContext } from "@renderer/context";
|
||||
import cloneDeep from "lodash/cloneDeep";
|
||||
import {
|
||||
@@ -11,10 +11,6 @@ import {
|
||||
} from "@renderer/components/ui";
|
||||
import { MediaCaption, MediaCaptionActions } from "@renderer/components";
|
||||
import { t } from "i18next";
|
||||
import {
|
||||
Timeline,
|
||||
TimelineEntry,
|
||||
} from "echogarden/dist/utilities/Timeline.d.js";
|
||||
import {
|
||||
MediaCaptionAnalysis,
|
||||
MediaCaptionNote,
|
||||
@@ -29,12 +25,14 @@ export const MediaRightPanel = (props: {
|
||||
}) => {
|
||||
const { className, setDisplayPanel } = props;
|
||||
const {
|
||||
caption,
|
||||
currentSegmentIndex,
|
||||
currentTime,
|
||||
transcription,
|
||||
regions,
|
||||
activeRegion,
|
||||
setActiveRegion,
|
||||
toggleRegion,
|
||||
editingRegion,
|
||||
setEditingRegion,
|
||||
setTranscriptionDraft,
|
||||
@@ -47,7 +45,6 @@ export const MediaRightPanel = (props: {
|
||||
const [displayIpa, setDisplayIpa] = useState<boolean>(true);
|
||||
const [displayNotes, setDisplayNotes] = useState<boolean>(true);
|
||||
|
||||
const [caption, setCaption] = useState<TimelineEntry | null>(null);
|
||||
const [tab, setTab] = useState<string>("translation");
|
||||
|
||||
const toggleMultiSelect = (event: KeyboardEvent) => {
|
||||
@@ -79,67 +76,6 @@ export const MediaRightPanel = (props: {
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Make the words at the given caption indices the active region.
 *
 * Nothing happens without an active region. While a region is being edited a
 * warning toast is shown instead. An empty selection removes an active word
 * region and re-activates the segment region.
 *
 * @param params Indices into `caption.timeline`; the smallest and largest
 *               index delimit the new region.
 */
const toggleRegion = (params: number[]) => {
  if (!activeRegion) return;

  if (editingRegion) {
    toast.warning(t("currentRegionIsBeingEdited"));
    return;
  }

  const isWordRegion = activeRegion.id.startsWith("word-region");
  const findSegmentRegion = () =>
    regions.getRegions().find((r) => r.id.startsWith("segment-region"));

  // Empty selection: only a word region needs to be torn down.
  if (params.length === 0) {
    if (isWordRegion) {
      activeRegion.remove();
      setActiveRegion(findSegmentRegion());
    }
    return;
  }

  const firstIndex = Math.min(...params);
  const lastIndex = Math.max(...params);

  const firstWord = caption.timeline[firstIndex];
  if (!firstWord) return;

  // An out-of-range end index collapses the selection onto the first word.
  const lastWord = caption.timeline[lastIndex] || firstWord;

  // A meaning-group region is not replaced; the segment region takes over.
  if (!isWordRegion && activeRegion.id.startsWith("meaning-group-region")) {
    setActiveRegion(findSegmentRegion());
    return;
  }

  // An existing word region is replaced by the merged selection; on a
  // segment region a new word region is simply added.
  if (isWordRegion) {
    activeRegion.remove();
  }

  const wordRegion = regions.addRegion({
    id: `word-region-${firstIndex}`,
    start: firstWord.startTime,
    end: lastWord.endTime,
    color: "#fb6f9233",
    drag: false,
    resize: isWordRegion ? editingRegion : false,
  });

  setActiveRegion(wordRegion);
};
|
||||
|
||||
useEffect(() => {
|
||||
if (!caption) return;
|
||||
|
||||
@@ -160,6 +96,7 @@ export const MediaRightPanel = (props: {
|
||||
toggleRegion(selectedIndices);
|
||||
}, [caption, selectedIndices]);
|
||||
|
||||
// Edit region to update transcription draft
|
||||
useEffect(() => {
|
||||
if (!activeRegion) return;
|
||||
if (!activeRegion.id.startsWith("word-region")) return;
|
||||
@@ -234,12 +171,6 @@ export const MediaRightPanel = (props: {
|
||||
};
|
||||
}, [editingRegion]);
|
||||
|
||||
useEffect(() => {
|
||||
setCaption(
|
||||
(transcription?.result?.timeline as Timeline)?.[currentSegmentIndex]
|
||||
);
|
||||
}, [currentSegmentIndex, transcription]);
|
||||
|
||||
useEffect(() => {
|
||||
return () => setSelectedIndices([]);
|
||||
}, [caption]);
|
||||
|
||||
@@ -4,16 +4,13 @@ import {
|
||||
MediaRightPanel,
|
||||
MediaLeftPanel,
|
||||
MediaBottomPanel,
|
||||
MediaProvider,
|
||||
} from "@renderer/components";
|
||||
import {
|
||||
Button,
|
||||
ResizableHandle,
|
||||
ResizablePanel,
|
||||
ResizablePanelGroup,
|
||||
} from "@renderer/components/ui";
|
||||
import { useContext, useState } from "react";
|
||||
import { RefreshCcwDotIcon } from "lucide-react";
|
||||
|
||||
export const MediaShadowPlayer = () => {
|
||||
return (
|
||||
|
||||
@@ -8,8 +8,9 @@ export const PronunciationAssessmentFulltextResult = (props: {
|
||||
words: PronunciationAssessmentWordResultType[];
|
||||
currentTime?: number;
|
||||
src?: string;
|
||||
onPlayOrigin?: (word: string, index: number) => void;
|
||||
}) => {
|
||||
const { words, currentTime, src } = props;
|
||||
const { words, currentTime, src, onPlayOrigin } = props;
|
||||
const [errorStats, setErrorStats] = useState({
|
||||
mispronunciation: 0,
|
||||
omission: 0,
|
||||
@@ -65,6 +66,16 @@ export const PronunciationAssessmentFulltextResult = (props: {
|
||||
errorDisplay={errorDisplay}
|
||||
currentTime={currentTime}
|
||||
src={src}
|
||||
onPlayOrigin={() => {
  if (!onPlayOrigin) return;

  const word = words[index];
  if (!word) return;

  // Report which occurrence of the word this is by counting the equal words
  // that precede it. The comparison ignores case because the handler wired
  // up in MediaCurrentRecording looks the word up case-insensitively; a
  // case-sensitive count would point at the wrong occurrence for text such
  // as "The ... the". Counting by position also avoids relying on `offset`
  // being present and unique for every word.
  const text = word.word.toLowerCase();
  const wordIndex = words
    .slice(0, index)
    .filter((w) => w.word.toLowerCase() === text).length;

  onPlayOrigin(word.word, wordIndex);
}}
|
||||
/>
|
||||
))}
|
||||
</div>
|
||||
|
||||
@@ -20,6 +20,7 @@ export const PronunciationAssessmentWordResult = (props: {
|
||||
monotone: boolean;
|
||||
};
|
||||
currentTime?: number;
|
||||
onPlayOrigin?: () => void;
|
||||
}) => {
|
||||
const {
|
||||
result,
|
||||
@@ -32,6 +33,7 @@ export const PronunciationAssessmentWordResult = (props: {
|
||||
monotone: true,
|
||||
},
|
||||
currentTime = 0,
|
||||
onPlayOrigin,
|
||||
} = props;
|
||||
|
||||
const audio = useRef<HTMLAudioElement>(null);
|
||||
@@ -71,25 +73,41 @@ export const PronunciationAssessmentWordResult = (props: {
|
||||
}[result.pronunciationAssessment.errorType];
|
||||
|
||||
/**
 * Play this word's slice of the recording loaded from `props.src`.
 *
 * `result.offset` and `result.duration` are divided by 1e7 to obtain seconds
 * (presumably 100-nanosecond ticks — confirm against the assessment API).
 */
const play = () => {
  const player = audio.current;
  if (!player || !props.src) return;

  const { offset, duration } = result;
  // An offset of 0 is a valid position (a word at the very start of the
  // recording), so only a missing offset aborts playback.
  if (typeof offset !== "number" || !duration) return;

  const startTime = offset / 1e7;
  const endTime = (offset + duration) / 1e7;

  player.currentTime = startTime;

  // Stop at the end of the word. Assigning the handler property (instead of
  // addEventListener) replaces the handler left by an earlier, unfinished
  // play() call, so handlers never accumulate. The handler closes over
  // `player`, so it keeps working even after the ref has been cleared.
  player.ontimeupdate = () => {
    if (player.currentTime >= endTime) {
      player.pause();
      player.ontimeupdate = null;
    }
  };

  player.play();
};

// Create the audio element for `props.src`; on cleanup (unmount or a change
// of `props.src`) stop playback, detach the stop-at-end handler installed by
// play() and drop the element so the next run creates a fresh one.
useEffect(() => {
  if (!audio.current) {
    audio.current = new Audio(props.src);
  }

  return () => {
    const player = audio.current;
    if (!player) return;

    player.pause();
    player.ontimeupdate = null;
    audio.current = null;
  };
}, [props.src]);
|
||||
|
||||
return (
|
||||
<Popover>
|
||||
@@ -152,11 +170,20 @@ export const PronunciationAssessmentWordResult = (props: {
|
||||
</div>
|
||||
)}
|
||||
|
||||
<div className="">
|
||||
<div className="flex items-center space-x-2">
|
||||
<span className="text-sm">{t("myPronunciation")}:</span>
|
||||
<Button onClick={play} variant="ghost" size="icon">
|
||||
<Volume2Icon className="w-5 h-5" />
|
||||
</Button>
|
||||
</div>
|
||||
{onPlayOrigin && (
|
||||
<div className="flex items-center space-x-2">
|
||||
<span className="text-sm">{t("originalPronunciation")}:</span>
|
||||
<Button onClick={onPlayOrigin} variant="ghost" size="icon">
|
||||
<Volume2Icon className="w-5 h-5" />
|
||||
</Button>
|
||||
</div>
|
||||
)}
|
||||
</PopoverContent>
|
||||
</Popover>
|
||||
);
|
||||
|
||||
@@ -14,8 +14,9 @@ export const RecordingDetail = (props: {
|
||||
recording: RecordingType;
|
||||
pronunciationAssessment?: PronunciationAssessmentType;
|
||||
onAssess?: (assessment: PronunciationAssessmentType) => void;
|
||||
// Called to play the original pronunciation of a word. `index` is the
// 0-based occurrence of `word`, as supplied by
// PronunciationAssessmentFulltextResult, to which this prop is forwarded.
onPlayOrigin?: (word: string, index: number) => void;
|
||||
}) => {
|
||||
const { recording, onAssess } = props;
|
||||
const { recording, onAssess, onPlayOrigin } = props;
|
||||
if (!recording) return;
|
||||
|
||||
const [pronunciationAssessment, setPronunciationAssessment] =
|
||||
@@ -40,7 +41,7 @@ export const RecordingDetail = (props: {
|
||||
setAssessing(true);
|
||||
createAssessment({
|
||||
recording,
|
||||
reference: recording.referenceText || "",
|
||||
reference: recording.referenceText?.replace(/[—]/g, ", ") || "",
|
||||
language: recording.language || learningLanguage,
|
||||
})
|
||||
.then((assessment) => {
|
||||
@@ -76,6 +77,7 @@ export const RecordingDetail = (props: {
|
||||
words={result.words}
|
||||
currentTime={currentTime}
|
||||
src={recording.src}
|
||||
onPlayOrigin={onPlayOrigin}
|
||||
/>
|
||||
) : (
|
||||
<ScrollArea className="min-h-72 py-4 px-8 select-text">
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
import { createContext, useEffect, useState, useContext } from "react";
|
||||
import { createContext, useEffect, useState, useContext, useMemo } from "react";
|
||||
import { convertIpaToNormal, extractFrequencies } from "@/utils";
|
||||
import { AppSettingsProviderContext } from "@renderer/context";
|
||||
import {
|
||||
@@ -12,7 +12,10 @@ import Regions, {
|
||||
type Region as RegionType,
|
||||
} from "wavesurfer.js/dist/plugins/regions";
|
||||
import Chart from "chart.js/auto";
|
||||
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
|
||||
import {
|
||||
Timeline,
|
||||
TimelineEntry,
|
||||
} from "echogarden/dist/utilities/Timeline.d.js";
|
||||
import { toast } from "@renderer/components/ui";
|
||||
import { Tooltip } from "react-tooltip";
|
||||
import { useAudioRecorder } from "react-audio-voice-recorder";
|
||||
@@ -48,6 +51,7 @@ type MediaShadowContextType = {
|
||||
regions: Regions | null;
|
||||
activeRegion: RegionType;
|
||||
setActiveRegion: (region: RegionType) => void;
|
||||
toggleRegion: (params: number[]) => void;
|
||||
renderPitchContour: (
|
||||
region: RegionType,
|
||||
options?: {
|
||||
@@ -74,6 +78,7 @@ type MediaShadowContextType = {
|
||||
transcribingOutput: string;
|
||||
transcriptionDraft: TranscriptionType["result"];
|
||||
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
|
||||
caption: TimelineEntry;
|
||||
// Recordings
|
||||
startRecording: () => void;
|
||||
stopRecording: () => void;
|
||||
@@ -180,6 +185,10 @@ export const MediaShadowProvider = ({
|
||||
toast.error(exception.message);
|
||||
});
|
||||
|
||||
// Timeline entry of the segment at `currentSegmentIndex`. Evaluates to
// undefined when the transcription, its result or its timeline is missing.
// Recomputed only when the segment index or the transcription changes.
const caption = useMemo(
  () => (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex],
  [currentSegmentIndex, transcription]
);
|
||||
|
||||
const { segment, createSegment } = useSegments({
|
||||
targetId: media?.id,
|
||||
targetType: media?.mediaType,
|
||||
@@ -466,6 +475,67 @@ export const MediaShadowProvider = ({
|
||||
);
|
||||
};
|
||||
|
||||
/**
 * Turn a selection of caption word indices into the active region.
 *
 * - Without an active region nothing happens.
 * - While a region is being edited, a warning toast is shown and nothing
 *   else happens.
 * - An empty selection removes an active word region and re-activates the
 *   segment region.
 * - Otherwise a word region spanning from the first selected word's start to
 *   the last selected word's end becomes active. It replaces an active word
 *   region or is added on top of a segment region. When a meaning-group
 *   region is active, the segment region is activated instead.
 *
 * @param params Indices into `caption.timeline`; order does not matter, the
 *               smallest and largest index delimit the region.
 */
const toggleRegion = (params: number[]) => {
  if (!activeRegion) return;
  if (editingRegion) {
    toast.warning(t("currentRegionIsBeingEdited"));
    return;
  }
  // `regions` is typed `Regions | null`; without it no region can be looked
  // up or created.
  if (!regions) return;

  const onWordRegion = activeRegion.id.startsWith("word-region");
  const activateSegmentRegion = () =>
    setActiveRegion(
      regions.getRegions().find((r) => r.id.startsWith("segment-region"))
    );

  if (params.length === 0) {
    if (onWordRegion) {
      activeRegion.remove();
      activateSegmentRegion();
    }
    return;
  }

  // `caption` is derived with optional chaining and can be undefined, so do
  // not dereference it blindly.
  const timeline = caption?.timeline;
  if (!timeline) return;

  const startIndex = Math.min(...params);
  const endIndex = Math.max(...params);

  const startWord = timeline[startIndex];
  if (!startWord) return;

  // An out-of-range end index collapses the selection onto the start word.
  const endWord = timeline[endIndex] || startWord;

  // If the active region is a meaning group region, then activate the
  // segment region.
  if (activeRegion.id.startsWith("meaning-group-region")) {
    activateSegmentRegion();
    return;
  }

  // If the active region is a word region, the selected words are merged
  // into a single region that replaces it; on a segment region a new word
  // region is created.
  if (onWordRegion) {
    activeRegion.remove();
  }

  const region = regions.addRegion({
    id: `word-region-${startIndex}`,
    start: startWord.startTime,
    end: endWord.endTime,
    color: "#fb6f9233",
    drag: false,
    // `editingRegion` is falsy at this point because of the guard above; it
    // is passed through unchanged for the word-region case.
    resize: onWordRegion ? editingRegion : false,
  });

  setActiveRegion(region);
};
|
||||
|
||||
/*
|
||||
* When wavesurfer is decoded,
|
||||
* set up event listeners for wavesurfer
|
||||
@@ -667,6 +737,7 @@ export const MediaShadowProvider = ({
|
||||
pitchChart,
|
||||
activeRegion,
|
||||
setActiveRegion,
|
||||
toggleRegion,
|
||||
renderPitchContour,
|
||||
editingRegion,
|
||||
setEditingRegion,
|
||||
@@ -676,6 +747,7 @@ export const MediaShadowProvider = ({
|
||||
transcribingOutput,
|
||||
transcriptionDraft,
|
||||
setTranscriptionDraft,
|
||||
caption,
|
||||
startRecording,
|
||||
stopRecording,
|
||||
cancelRecording,
|
||||
|
||||
Reference in New Issue
Block a user