Improve transcription (#1182)

* handle special characters in the transcription when assessing

* allow playing the original pronunciation in the assessment result
an-lee
2024-11-15 15:09:07 +08:00
committed by GitHub
parent 1f531b0cbc
commit b8167a99d8
11 changed files with 172 additions and 103 deletions

View File

@@ -927,5 +927,7 @@
"openaiTtsServiceDescription": "Use OpenAI TTS service from your own key.",
"enjoyTtsServiceDescription": "Use TTS service provided by Enjoy. OpenAI or Azure is supported.",
"compressMediaBeforeAdding": "Compress media before adding",
"keepOriginalMedia": "Keep original media"
"keepOriginalMedia": "Keep original media",
"myPronunciation": "My pronunciation",
"originalPronunciation": "Original pronunciation"
}

View File

@@ -927,5 +927,7 @@
"openaiTtsServiceDescription": "使用您自己的 API key 来使用 OpenAI TTS 服务。",
"enjoyTtsServiceDescription": "使用 Enjoy 提供的 TTS 服务,支持 OpenAI 或 Azure。",
"compressMediaBeforeAdding": "添加前压缩媒体",
"keepOriginalMedia": "保存原始媒体"
"keepOriginalMedia": "保存原始媒体",
"myPronunciation": "我的发音",
"originalPronunciation": "原始发音"
}

View File

@@ -1,4 +1,4 @@
import { useEffect, useContext, useRef, useState } from "react";
import { useEffect, useContext, useRef, useState, useMemo } from "react";
import {
AppSettingsProviderContext,
HotKeysSettingsProviderContext,
@@ -50,17 +50,13 @@ import { formatDuration } from "@renderer/lib/utils";
import { useHotkeys } from "react-hotkeys-hook";
import { LiveAudioVisualizer } from "react-audio-visualize";
import debounce from "lodash/debounce";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
const ACTION_BUTTON_HEIGHT = 35;
export const MediaCurrentRecording = () => {
const {
isRecording,
isPaused,
cancelRecording,
togglePauseResume,
stopRecording,
recordingTime,
mediaRecorder,
currentRecording,
renderPitchContour: renderMediaPitchContour,
regions: mediaRegions,
@@ -71,6 +67,8 @@ export const MediaCurrentRecording = () => {
currentSegment,
createSegment,
currentTime: mediaCurrentTime,
caption,
toggleRegion,
} = useContext(MediaShadowProviderContext);
const { webApi, EnjoyApp } = useContext(AppSettingsProviderContext);
const { currentHotkeys } = useContext(HotKeysSettingsProviderContext);
@@ -263,6 +261,23 @@ export const MediaCurrentRecording = () => {
});
};
const playWord = (word: string, index: number) => {
const candidates = caption.timeline.filter(
(w: TimelineEntry) => w.text.toLowerCase() === word.toLowerCase()
);
const target = candidates[index];
if (!target) return;
const wordIndex = caption.timeline.findIndex(
(w) => w.startTime === target.startTime
);
toggleRegion([wordIndex]);
setTimeout(() => {
wavesurfer?.playPause();
}, 250);
};
const calContainerSize = () => {
const size = ref?.current
?.closest(".media-recording-wrapper")
@@ -685,7 +700,12 @@ export const MediaCurrentRecording = () => {
</SheetClose>
</SheetHeader>
<RecordingDetail recording={currentRecording} />
<RecordingDetail
recording={currentRecording}
onPlayOrigin={(word: string, index: number = 0) =>
playWord(word, index)
}
/>
</SheetContent>
</Sheet>
</div>
@@ -745,7 +765,6 @@ const MediaRecorder = () => {
const {
mediaRecorder,
recordingTime,
isRecording,
isPaused,
cancelRecording,
togglePauseResume,
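
The playWord handler added in this file resolves a word plus its occurrence index back to an absolute position in the caption timeline before toggling a region and playing it. A minimal standalone sketch of that lookup, using a hypothetical Entry shape rather than echogarden's TimelineEntry type:

// Sketch only: resolve (word, occurrence) back to an absolute timeline index.
interface Entry {
  text: string;
  startTime: number;
}

function resolveWordIndex(timeline: Entry[], word: string, occurrence: number): number {
  const candidates = timeline.filter(
    (e) => e.text.toLowerCase() === word.toLowerCase()
  );
  const target = candidates[occurrence];
  if (!target) return -1;
  return timeline.findIndex((e) => e.startTime === target.startTime);
}

// With timeline texts ["The", "cat", "saw", "the", "dog"],
// resolveWordIndex(timeline, "the", 1) === 3 (the second "the").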

View File

@@ -318,7 +318,7 @@ export const MediaPlayerControls = () => {
wavesurfer.pause();
setTimeout(() => {
activeRegionDebouncePlay();
}, 500);
}, 250);
} else if (playMode === "single") {
wavesurfer.pause();
}

View File

@@ -34,7 +34,13 @@ export const MediaCaption = (props: {
const [notedquoteIndices, setNotedquoteIndices] = useState<number[]>([]);
let words = caption.text.split(" ");
let words = caption.text
.replace(/ ([.,!?:;])/g, "$1")
.replace(/ (['"")])/g, "$1")
.replace(/ \.\.\./g, "...")
.split(/([—]|\s+)/g)
.filter((word) => word.trim() !== "" && word !== "—");
const ipas = caption.timeline.map((w) =>
w.timeline?.map((t) =>
t.timeline && language.startsWith("en")
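
The new splitting in this hunk is what the commit message calls special-character handling: detached punctuation is re-attached to the preceding word, ellipses are rejoined, and em dashes are dropped, so the rendered words line up with the assessment output. A rough standalone version of the same transformation (illustrative helper name, not the component code itself):

// Illustration of the punctuation-aware split.
function splitCaptionWords(text: string): string[] {
  return text
    .replace(/ ([.,!?:;])/g, "$1") // re-attach trailing punctuation
    .replace(/ \.\.\./g, "...")    // re-attach ellipses
    .split(/([—]|\s+)/g)           // split on whitespace and em dashes
    .filter((w) => w.trim() !== "" && w !== "—");
}

// splitCaptionWords("Well — hello , world !") -> ["Well", "hello,", "world!"]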

View File

@@ -1,4 +1,4 @@
import { useEffect, useState, useContext, useRef } from "react";
import { useEffect, useState, useContext, useRef, useMemo } from "react";
import { MediaShadowProviderContext } from "@renderer/context";
import cloneDeep from "lodash/cloneDeep";
import {
@@ -11,10 +11,6 @@ import {
} from "@renderer/components/ui";
import { MediaCaption, MediaCaptionActions } from "@renderer/components";
import { t } from "i18next";
import {
Timeline,
TimelineEntry,
} from "echogarden/dist/utilities/Timeline.d.js";
import {
MediaCaptionAnalysis,
MediaCaptionNote,
@@ -29,12 +25,14 @@ export const MediaRightPanel = (props: {
}) => {
const { className, setDisplayPanel } = props;
const {
caption,
currentSegmentIndex,
currentTime,
transcription,
regions,
activeRegion,
setActiveRegion,
toggleRegion,
editingRegion,
setEditingRegion,
setTranscriptionDraft,
@@ -47,7 +45,6 @@ export const MediaRightPanel = (props: {
const [displayIpa, setDisplayIpa] = useState<boolean>(true);
const [displayNotes, setDisplayNotes] = useState<boolean>(true);
const [caption, setCaption] = useState<TimelineEntry | null>(null);
const [tab, setTab] = useState<string>("translation");
const toggleMultiSelect = (event: KeyboardEvent) => {
@@ -79,67 +76,6 @@ export const MediaRightPanel = (props: {
}
};
const toggleRegion = (params: number[]) => {
if (!activeRegion) return;
if (editingRegion) {
toast.warning(t("currentRegionIsBeingEdited"));
return;
}
if (params.length === 0) {
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
}
return;
}
const startIndex = Math.min(...params);
const endIndex = Math.max(...params);
const startWord = caption.timeline[startIndex];
if (!startWord) return;
const endWord = caption.timeline[endIndex] || startWord;
const start = startWord.startTime;
const end = endWord.endTime;
// If the active region is a word region, then merge the selected words into a single region.
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
// If the active region is a meaning group region, then activate the segment region.
} else if (activeRegion.id.startsWith("meaning-group-region")) {
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
// If the active region is a segment region, then create a new word region.
} else {
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: false,
});
setActiveRegion(region);
}
};
useEffect(() => {
if (!caption) return;
@@ -160,6 +96,7 @@ export const MediaRightPanel = (props: {
toggleRegion(selectedIndices);
}, [caption, selectedIndices]);
// Edit region to update transcription draft
useEffect(() => {
if (!activeRegion) return;
if (!activeRegion.id.startsWith("word-region")) return;
@@ -234,12 +171,6 @@ export const MediaRightPanel = (props: {
};
}, [editingRegion]);
useEffect(() => {
setCaption(
(transcription?.result?.timeline as Timeline)?.[currentSegmentIndex]
);
}, [currentSegmentIndex, transcription]);
useEffect(() => {
return () => setSelectedIndices([]);
}, [caption]);

View File

@@ -4,16 +4,13 @@ import {
MediaRightPanel,
MediaLeftPanel,
MediaBottomPanel,
MediaProvider,
} from "@renderer/components";
import {
Button,
ResizableHandle,
ResizablePanel,
ResizablePanelGroup,
} from "@renderer/components/ui";
import { useContext, useState } from "react";
import { RefreshCcwDotIcon } from "lucide-react";
export const MediaShadowPlayer = () => {
return (

View File

@@ -8,8 +8,9 @@ export const PronunciationAssessmentFulltextResult = (props: {
words: PronunciationAssessmentWordResultType[];
currentTime?: number;
src?: string;
onPlayOrigin?: (word: string, index: number) => void;
}) => {
const { words, currentTime, src } = props;
const { words, currentTime, src, onPlayOrigin } = props;
const [errorStats, setErrorStats] = useState({
mispronunciation: 0,
omission: 0,
@@ -65,6 +66,16 @@ export const PronunciationAssessmentFulltextResult = (props: {
errorDisplay={errorDisplay}
currentTime={currentTime}
src={src}
onPlayOrigin={() => {
if (!onPlayOrigin) return;
const word = words[index];
const candidates = words.filter((w) => w.word === word.word);
const wordIndex = candidates.findIndex(
(w) => w.offset === word.offset
);
onPlayOrigin(word.word, wordIndex);
}}
/>
))}
</div>
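
The onPlayOrigin wrapper above converts the absolute index in the assessment word list into a (word, occurrence) pair, since the assessment words and the caption timeline are not guaranteed to share indices one-to-one; only the word text plus how many identical words precede it survives the round trip. A compact sketch of that forward mapping, with a hypothetical AssessedWord shape:

// Sketch: turn an absolute index into (word, occurrence among equal words).
interface AssessedWord {
  word: string;
  offset: number; // start offset, unique per word in the result
}

function toOccurrence(words: AssessedWord[], index: number): [string, number] {
  const target = words[index];
  if (!target) return ["", -1];
  const candidates = words.filter((w) => w.word === target.word);
  const occurrence = candidates.findIndex((w) => w.offset === target.offset);
  return [target.word, occurrence];
}

// words: [{word:"the",offset:0},{word:"cat",offset:5e6},{word:"the",offset:9e6}]
// toOccurrence(words, 2) -> ["the", 1]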

View File

@@ -20,6 +20,7 @@ export const PronunciationAssessmentWordResult = (props: {
monotone: boolean;
};
currentTime?: number;
onPlayOrigin?: () => void;
}) => {
const {
result,
@@ -32,6 +33,7 @@ export const PronunciationAssessmentWordResult = (props: {
monotone: true,
},
currentTime = 0,
onPlayOrigin,
} = props;
const audio = useRef<HTMLAudioElement>(null);
@@ -71,25 +73,41 @@ export const PronunciationAssessmentWordResult = (props: {
}[result.pronunciationAssessment.errorType];
const play = () => {
const { offset, duration } = result;
if (!audio.current || !props.src) return;
// create a new audio element and play the segment
audio.current.src = `${props.src}#t=${(offset * 1.0) / 1e7},${
((offset + duration) * 1.0) / 1e7
}`;
const { offset, duration } = result;
if (!offset || !duration) return;
const startTime = (offset * 1.0) / 1e7;
const endTime = ((offset + duration) * 1.0) / 1e7;
audio.current.currentTime = startTime;
// Add timeupdate listener to stop at the end of the segment
const handleTimeUpdate = () => {
if (audio.current.currentTime >= endTime) {
audio.current.pause();
audio.current.removeEventListener("timeupdate", handleTimeUpdate);
}
};
audio.current.addEventListener("timeupdate", handleTimeUpdate);
audio.current.play();
};
useEffect(() => {
if (!audio.current) {
audio.current = new Audio();
audio.current = new Audio(props.src);
}
return () => {
audio.current?.pause();
delete audio.current;
if (audio.current) {
audio.current.pause();
audio.current.removeEventListener("timeupdate", () => {});
audio.current = null;
}
};
}, []);
}, [props.src]);
return (
<Popover>
@@ -152,11 +170,20 @@ export const PronunciationAssessmentWordResult = (props: {
</div>
)}
<div className="">
<div className="flex items-center space-x-2">
<span className="text-sm">{t("myPronunciation")}:</span>
<Button onClick={play} variant="ghost" size="icon">
<Volume2Icon className="w-5 h-5" />
</Button>
</div>
{onPlayOrigin && (
<div className="flex items-center space-x-2">
<span className="text-sm">{t("originalPronunciation")}:</span>
<Button onClick={onPlayOrigin} variant="ghost" size="icon">
<Volume2Icon className="w-5 h-5" />
</Button>
</div>
)}
</PopoverContent>
</Popover>
);
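
The rewritten play in this file stops assigning a media-fragment URL (#t=start,end) to src and instead seeks with currentTime, pausing via a timeupdate listener. The underlying pattern, as a standalone sketch; note that removal only works when the same handler reference is passed to removeEventListener:

// Generic sketch: play only the [start, end] second range of an audio element.
function playSegment(audio: HTMLAudioElement, start: number, end: number) {
  audio.currentTime = start;
  const onTimeUpdate = () => {
    if (audio.currentTime >= end) {
      audio.pause();
      audio.removeEventListener("timeupdate", onTimeUpdate);
    }
  };
  audio.addEventListener("timeupdate", onTimeUpdate);
  audio.play();
}

// e.g. playSegment(new Audio(src), 1.2, 1.8) plays roughly 0.6 s of audio.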

View File

@@ -14,8 +14,9 @@ export const RecordingDetail = (props: {
recording: RecordingType;
pronunciationAssessment?: PronunciationAssessmentType;
onAssess?: (assessment: PronunciationAssessmentType) => void;
onPlayOrigin?: (word: string) => void;
}) => {
const { recording, onAssess } = props;
const { recording, onAssess, onPlayOrigin } = props;
if (!recording) return;
const [pronunciationAssessment, setPronunciationAssessment] =
@@ -40,7 +41,7 @@ export const RecordingDetail = (props: {
setAssessing(true);
createAssessment({
recording,
reference: recording.referenceText || "",
reference: recording.referenceText?.replace(/[—]/g, ", ") || "",
language: recording.language || learningLanguage,
})
.then((assessment) => {
@@ -76,6 +77,7 @@ export const RecordingDetail = (props: {
words={result.words}
currentTime={currentTime}
src={recording.src}
onPlayOrigin={onPlayOrigin}
/>
) : (
<ScrollArea className="min-h-72 py-4 px-8 select-text">
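
Besides threading onPlayOrigin through, this file normalizes the reference text before requesting an assessment: em dashes are replaced with ", ", likely so the service receives plain comma-separated clauses instead of a character it may not align well. A one-line illustration:

// "I was there—wasn't I?" -> "I was there, wasn't I?"
const reference = "I was there—wasn't I?".replace(/[—]/g, ", ");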

View File

@@ -1,4 +1,4 @@
import { createContext, useEffect, useState, useContext } from "react";
import { createContext, useEffect, useState, useContext, useMemo } from "react";
import { convertIpaToNormal, extractFrequencies } from "@/utils";
import { AppSettingsProviderContext } from "@renderer/context";
import {
@@ -12,7 +12,10 @@ import Regions, {
type Region as RegionType,
} from "wavesurfer.js/dist/plugins/regions";
import Chart from "chart.js/auto";
import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
import {
Timeline,
TimelineEntry,
} from "echogarden/dist/utilities/Timeline.d.js";
import { toast } from "@renderer/components/ui";
import { Tooltip } from "react-tooltip";
import { useAudioRecorder } from "react-audio-voice-recorder";
@@ -48,6 +51,7 @@ type MediaShadowContextType = {
regions: Regions | null;
activeRegion: RegionType;
setActiveRegion: (region: RegionType) => void;
toggleRegion: (params: number[]) => void;
renderPitchContour: (
region: RegionType,
options?: {
@@ -74,6 +78,7 @@ type MediaShadowContextType = {
transcribingOutput: string;
transcriptionDraft: TranscriptionType["result"];
setTranscriptionDraft: (result: TranscriptionType["result"]) => void;
caption: TimelineEntry;
// Recordings
startRecording: () => void;
stopRecording: () => void;
@@ -180,6 +185,10 @@ export const MediaShadowProvider = ({
toast.error(exception.message);
});
const caption = useMemo(() => {
return (transcription?.result?.timeline as Timeline)?.[currentSegmentIndex];
}, [currentSegmentIndex, transcription]);
const { segment, createSegment } = useSegments({
targetId: media?.id,
targetType: media?.mediaType,
@@ -466,6 +475,67 @@ export const MediaShadowProvider = ({
);
};
const toggleRegion = (params: number[]) => {
if (!activeRegion) return;
if (editingRegion) {
toast.warning(t("currentRegionIsBeingEdited"));
return;
}
if (params.length === 0) {
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
}
return;
}
const startIndex = Math.min(...params);
const endIndex = Math.max(...params);
const startWord = caption.timeline[startIndex];
if (!startWord) return;
const endWord = caption.timeline[endIndex] || startWord;
const start = startWord.startTime;
const end = endWord.endTime;
// If the active region is a word region, then merge the selected words into a single region.
if (activeRegion.id.startsWith("word-region")) {
activeRegion.remove();
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: editingRegion,
});
setActiveRegion(region);
// If the active region is a meaning group region, then activate the segment region.
} else if (activeRegion.id.startsWith("meaning-group-region")) {
setActiveRegion(
regions.getRegions().find((r) => r.id.startsWith("segment-region"))
);
// If the active region is a segment region, then create a new word region.
} else {
const region = regions.addRegion({
id: `word-region-${startIndex}`,
start,
end,
color: "#fb6f9233",
drag: false,
resize: false,
});
setActiveRegion(region);
}
};
/*
* When wavesurfer is decoded,
* set up event listeners for wavesurfer
@@ -667,6 +737,7 @@ export const MediaShadowProvider = ({
pitchChart,
activeRegion,
setActiveRegion,
toggleRegion,
renderPitchContour,
editingRegion,
setEditingRegion,
@@ -676,6 +747,7 @@ export const MediaShadowProvider = ({
transcribingOutput,
transcriptionDraft,
setTranscriptionDraft,
caption,
startRecording,
stopRecording,
cancelRecording,