Force alignment for TTS audio (#418)
* add originalText as param
* save original text when added from TTS speech
* fix player in conversation sheet
* minor fix
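At a glance: when audio is created from TTS speech, the exact source text is already
known, so speech-to-text can be skipped entirely and echogarden only has to
force-align the audio against that text. A sketch of the new flow, using names from
this diff (the glue between the steps is illustrative, not the literal code):

    // 1. renderer saves the TTS audio together with its source text
    //    -> the audios handler stores a Transcription row: result = { originalText }
    // 2. useTranscribe() sees originalText, skips whisper, and tags the
    //    result with engine/model "original"
    // 3. forced alignment runs against the known text:
    const alignmentResult = await EnjoyApp.echogarden.align(
      new Uint8Array(await blob.arrayBuffer()),
      originalText // instead of joining recognized segments
    );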
@@ -72,6 +72,7 @@ class AudiosHandler {
     params: {
       name?: string;
       coverUrl?: string;
+      originalText?: string;
     } = {}
   ) {
     let file = uri;
@@ -95,19 +96,33 @@ class AudiosHandler {
       }
     }
 
-    return Audio.buildFromLocalFile(file, {
-      source,
-      ...params,
-    })
-      .then((audio) => {
-        return audio.toJSON();
-      })
-      .catch((err) => {
-        return event.sender.send("on-notification", {
-          type: "error",
-          message: t("models.audio.failedToAdd", { error: err.message }),
-        });
-      });
+    try {
+      const audio = await Audio.buildFromLocalFile(file, {
+        source,
+        name: params.name,
+        coverUrl: params.coverUrl,
+      });
+
+      // create transcription if originalText is provided
+      const { originalText } = params;
+      if (originalText) {
+        await Transcription.create({
+          targetType: "Audio",
+          targetId: audio.id,
+          targetMd5: audio.md5,
+          result: {
+            originalText,
+          },
+        });
+      }
+
+      return audio.toJSON();
+    } catch (err) {
+      return event.sender.send("on-notification", {
+        type: "error",
+        message: t("models.audio.failedToAdd", { error: err.message }),
+      });
+    }
   }
 
   private async update(
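Note that the `...params` spread was replaced with explicit `name`/`coverUrl` keys, so
`originalText` is not forwarded into `Audio.buildFromLocalFile`; it is persisted as a
Transcription row instead. For context, a sketch of a renderer-side call into this
handler; `EnjoyApp.audios.create` is an assumed bridge name (the preload bridge itself
is outside this diff):

    // hypothetical renderer call; the handler above persists originalText
    // as a Transcription row attached to the newly created Audio
    const audio = await EnjoyApp.audios.create(filePath, {
      name: "speech title",
      originalText: speechText, // full TTS input text, kept for forced alignment
    });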
@@ -229,6 +229,12 @@ export class Audio extends Model<Audio> {
         targetType: "Audio",
       },
     });
+    Transcription.destroy({
+      where: {
+        targetId: audio.id,
+        targetType: "Audio",
+      },
+    });
 
     const webApi = new Client({
       baseUrl: process.env.WEB_API_URL || WEB_API_URL,
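This appears to extend the model's cleanup logic: the tail of an existing destroy call
is visible above, and the new block removes the audio's Transcription rows as well. A
minimal sketch of the pattern, assuming a sequelize-typescript @AfterDestroy hook (the
actual hook name and decorator are not shown in this hunk):

    // sketch: cascade-delete rows that reference the destroyed audio
    @AfterDestroy
    static async cleanup(audio: Audio) {
      await Transcription.destroy({
        where: { targetId: audio.id, targetType: "Audio" },
      });
    }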
@@ -26,6 +26,10 @@ export const AudioPlayer = (props: { id?: string; md5?: string }) => {
 
   useEffect(() => {
     setRef(ref);
+
+    return () => {
+      setRef(null);
+    };
   }, [ref]);
 
   return (
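The player registers its container element with the shared media-player context, and
the added cleanup unregisters it on unmount. A sketch of the registration pattern,
assuming the component reads setRef from MediaPlayerProviderContext (the provider
changes later in this diff use the same names; the import path is a guess):

    import { useContext, useEffect, useRef } from "react";
    // assumed import path for the context
    import { MediaPlayerProviderContext } from "@renderer/context";

    const PlayerContainer = () => {
      const ref = useRef<HTMLDivElement>(null);
      const { setRef } = useContext(MediaPlayerProviderContext);

      useEffect(() => {
        setRef(ref); // register this container with the provider
        return () => setRef(null); // unregister on unmount -- the fix above
      }, [ref]);

      return <div ref={ref} />;
    };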
@@ -72,6 +72,7 @@ export const MediaCurrentRecording = (props: { height?: number }) => {
 
   const removeComparingPitchContour = () => {
     if (!wavesurfer) return;
+    if (!regions) return;
 
     regions
       .getRegions()

@@ -72,7 +72,7 @@ export const MediaTranscription = () => {
     } as ScrollIntoViewOptions);
   }, [currentSegmentIndex, transcription, containerRef]);
 
-  if (!transcription?.result) {
+  if (!transcription?.result?.timeline) {
     return null;
   }

@@ -104,6 +104,7 @@ export const AssistantMessageComponent = (props: {
       speech.text.length > 20
         ? speech.text.substring(0, 17).trim() + "..."
         : speech.text,
+      originalText: speech.text,
     });
     setResourcing(false);
   }
@@ -251,7 +252,7 @@ export const AssistantMessageComponent = (props: {
         </SheetClose>
       </SheetHeader>
 
-      {Boolean(speech) && <AudioPlayer md5={speech.md5} />}
+      {Boolean(speech) && shadowing && <AudioPlayer md5={speech.md5} />}
     </SheetContent>
   </Sheet>
 </div>
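These two hunks carry the commit's headline items: the TTS speech's full text rides
along as originalText when the audio resource is created, and the sheet's player now
mounts only while shadowing is active, so it cannot compete with another AudioPlayer
for the provider's single container ref. The first hunk's surrounding call presumably
looks like the following (createSpeechResource and the name: key are hypothetical,
inferred from the visible argument lines):

    // inferred shape of the call the first hunk patches
    createSpeechResource({
      name:
        speech.text.length > 20
          ? speech.text.substring(0, 17).trim() + "..."
          : speech.text,
      originalText: speech.text, // added: full text for forced alignment
    });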
@@ -117,7 +117,7 @@ export const MediaPlayerProvider = ({
   const initializeWavesurfer = async () => {
     if (!media) return;
     if (!mediaProvider) return;
-    if (!ref.current) return;
+    if (!ref?.current) return;
 
     const ws = WaveSurfer.create({
       container: ref.current,
@@ -299,22 +299,6 @@ export const MediaPlayerProvider = ({
     );
   };
 
-  useEffect(() => {
-    if (!media) return;
-
-    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
-      setWaveForm(waveform);
-    });
-  }, [media]);
-
-  /*
-   * Initialize wavesurfer when container ref is available
-   * and mediaProvider is available
-   */
-  useEffect(() => {
-    initializeWavesurfer();
-  }, [media, ref, mediaProvider]);
-
   /*
    * When wavesurfer is decoded,
    * set up event listeners for wavesurfer
@@ -353,6 +337,7 @@ export const MediaPlayerProvider = ({
 
     return () => {
       subscriptions.forEach((unsub) => unsub());
+      wavesurfer?.destroy();
     };
   }, [wavesurfer]);
 
@@ -372,6 +357,10 @@ export const MediaPlayerProvider = ({
     } else if (activeRegion.id.startsWith("word-region")) {
       setFitZoomRatio(containerWidth / 3 / duration / minPxPerSec);
     }
+
+    return () => {
+      setFitZoomRatio(1.0);
+    }
   }, [ref, wavesurfer, activeRegion]);
 
   /*
@@ -395,7 +384,7 @@ export const MediaPlayerProvider = ({
     if (!activeRegion) return;
 
     renderPitchContour(activeRegion);
-  }, [activeRegion]);
+  }, [wavesurfer, activeRegion]);
 
   /*
    * Update player styles
@@ -408,6 +397,22 @@ export const MediaPlayerProvider = ({
     scrollContainer.style.scrollbarWidth = "thin";
   }, [decoded, wavesurfer]);
 
+  useEffect(() => {
+    if (!media) return;
+
+    EnjoyApp.waveforms.find(media.md5).then((waveform) => {
+      setWaveForm(waveform);
+    });
+  }, [media]);
+
+  /*
+   * Initialize wavesurfer when container ref is available
+   * and mediaProvider is available
+   */
+  useEffect(() => {
+    initializeWavesurfer();
+  }, [media, ref, mediaProvider]);
+
   return (
     <MediaPlayerProviderContext.Provider
       value={{
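Taken together these changes tighten the wavesurfer lifecycle: the waveform lookup and
initialization effects move below the style effect, the instance is destroyed when its
event-listener effect re-runs, pitch contours re-render when wavesurfer itself changes
(not only activeRegion), and the fit-zoom ratio resets when the region changes. A
condensed sketch of the lifecycle the provider now follows (WaveSurfer.create and the
unsubscribe-returning .on() are wavesurfer.js v7 API; setDecoded is an assumed setter
for the decoded state seen in the deps above):

    // create on [media, ref, mediaProvider] -> listen on [wavesurfer] -> destroy in cleanup
    const ws = WaveSurfer.create({ container: ref.current });
    const subscriptions = [ws.on("decode", () => setDecoded(true))];
    return () => {
      subscriptions.forEach((unsub) => unsub());
      ws.destroy(); // no stale instance is left behind when initializeWavesurfer re-runs
    };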
@@ -79,34 +79,43 @@ export const useTranscribe = () => {
     params?: {
       targetId?: string;
       targetType?: string;
+      originalText?: string;
     }
   ): Promise<{
     engine: string;
     model: string;
     alignmentResult: AlignmentResult;
+    originalText?: string;
   }> => {
     const blob = await transcode(mediaSrc);
+    const { targetId, targetType, originalText } = params || {};
 
     let result;
-    if (whisperConfig.service === "local") {
+    if (originalText) {
+      result = {
+        engine: "original",
+        model: "original",
+      };
+    } else if (whisperConfig.service === "local") {
       result = await transcribeByLocal(blob);
     } else if (whisperConfig.service === "cloudflare") {
       result = await transcribeByCloudflareAi(blob);
     } else if (whisperConfig.service === "openai") {
       result = await transcribeByOpenAi(blob);
     } else if (whisperConfig.service === "azure") {
-      result = await transcribeByAzureAi(blob, params);
+      result = await transcribeByAzureAi(blob, { targetId, targetType });
     } else {
       throw new Error(t("whisperServiceNotSupported"));
     }
 
     const alignmentResult = await EnjoyApp.echogarden.align(
       new Uint8Array(await blob.arrayBuffer()),
-      result.result.map((segment) => segment.text).join(" ")
+      originalText || result.result.map((segment) => segment.text).join(" ")
     );
 
     return {
       ...result,
+      originalText,
       alignmentResult,
     };
   };
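One subtlety: when originalText is set, result is just { engine: "original", model:
"original" } and has no result array of recognized segments. The `originalText || ...`
fallback short-circuits before result.result is dereferenced, so the alignment text is
always defined:

    // equivalent expansion of the alignment-text expression above
    const alignmentText = originalText
      ? originalText // TTS source text: no speech-to-text was run
      : result.result.map((segment) => segment.text).join(" "); // recognized text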
@@ -29,30 +29,41 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
       setTranscription(record);
     }
   };
-  const findOrCreateTranscription = async () => {
-    if (!media) return;
-    if (transcription) return;
-
-    return EnjoyApp.transcriptions
-      .findOrCreate({
-        targetId: media.id,
-        targetType: media.mediaType,
-      })
-      .then((t) => {
-        if (t.result && !t.result["transcript"]) {
-          t.result = null;
-        }
-        setTranscription(t);
-      })
-      .catch((err) => {
-        toast.error(err.message);
-      });
-  };
+  const findOrCreateTranscription =
+    async (): Promise<TranscriptionType | void> => {
+      if (!media) return;
+      if (transcription?.targetId === media.id) return;
+
+      return EnjoyApp.transcriptions
+        .findOrCreate({
+          targetId: media.id,
+          targetType: media.mediaType,
+        })
+        .then((t) => {
+          if (t.result && !t.result["timeline"]) {
+            t.result = {
+              originalText: t.result?.originalText,
+            };
+          }
+          setTranscription(t);
+          return t;
+        })
+        .catch((err) => {
+          toast.error(err.message);
+        });
+    };
 
   const generateTranscription = async () => {
     if (transcribing) return;
-    if (!transcription) {
-      await findOrCreateTranscription();
-    }
+
+    let originalText: string;
+    if (transcription) {
+      originalText = transcription.result?.originalText;
+    } else {
+      const r = await findOrCreateTranscription();
+      if (r) {
+        originalText = r.result?.originalText;
+      }
+    }
 
     setTranscribing(true);
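The reshaping in .then() is what lets a TTS-created row flow through generation: the
audios handler stores result = { originalText } with no timeline, so the row is
normalized to keep only the original text, which generateTranscription reads and
forwards to transcribe(). A walkthrough with an illustrative value:

    // lifecycle of a TTS-created transcription row (shapes from this diff)
    let row = { result: { originalText: "Hello there." } }; // created by the audios handler
    // findOrCreateTranscription: result has no timeline -> keep only originalText
    // generateTranscription:     originalText = row.result.originalText
    // transcribe():              engine/model "original"; echogarden aligns against it
    // on success:                result = { timeline, transcript, originalText }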
@@ -61,6 +72,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     const { engine, model, alignmentResult } = await transcribe(media.src, {
       targetId: media.id,
       targetType: media.mediaType,
+      originalText,
     });
 
     let timeline: TimelineEntry[] = [];
@@ -105,6 +117,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     result: {
       timeline: timeline,
       transcript: alignmentResult.transcript,
+      originalText,
     },
     engine,
     model,
@@ -126,14 +139,16 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
   });
 
   const transcript = (res?.transcriptions || []).filter((t) =>
-    ["base", "small", "medium", "large", "whisper-1"].includes(t.model)
+    ["base", "small", "medium", "large", "whisper-1", "original"].includes(
+      t.model
+    )
   )?.[0];
 
   if (!transcript) {
     return Promise.reject("Transcription not found");
   }
 
-  if (!transcript.result["transcript"]) {
+  if (!transcript.result["timeline"]) {
     return Promise.reject("Transcription not aligned");
   }
@@ -149,17 +164,23 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
     try {
       await findTranscriptionFromWebApi();
     } catch (err) {
-      console.error(err);
+      console.warn(err);
       await generateTranscription();
     }
   };
 
+  /*
+   * find or create transcription
+   */
   useEffect(() => {
     if (!media) return;
 
     findOrCreateTranscription();
   }, [media]);
 
+  /*
+   * auto-generate transcription result
+   */
   useEffect(() => {
     if (!transcription) return;
 
@@ -167,7 +188,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
 
   if (
     transcription.state == "pending" ||
-    !transcription.result?.["transcript"]
+    !transcription.result?.["timeline"]
   ) {
     findOrGenerateTranscription();
   }

enjoy/src/types/transcription.d.ts
@@ -5,7 +5,7 @@ type TranscriptionType = {
   state: "pending" | "processing" | "finished";
   engine: string;
   model: string;
-  result: AlignmentResult;
+  result: AlignmentResult & { original?: string };
 };
 
 type TranscriptionResultSegmentType = {
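For reference, the usage elsewhere in this diff (alignmentResult.transcript plus a
timeline of TimelineEntry items) implies roughly the following shape; echogarden's
real typings may differ, and note that the runtime code reads result.originalText
while the intersection here adds original?:

    // assumed shape, inferred from usage in this diff
    type AlignmentResult = {
      transcript: string;
      timeline: TimelineEntry[]; // word/segment timings from forced alignment
    };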