Fix: handle echogarden align error (#620)

* Allow regenerating the transcription when alignment fails with "no matching voice found"
* Allow regenerating when the original text has no matching voice
* Specify the language explicitly to avoid "no matching voice found"
2024-05-20 10:30:28 +08:00
parent e0b2f59a23
commit cdbaf89005
10 changed files with 997 additions and 464 deletions
--- a/enjoy/package.json
+++ b/enjoy/package.json
@@ -129,7 +129,7 @@
    "dayjs": "^1.11.11",
    "decamelize": "^6.0.0",
    "decamelize-keys": "^2.0.1",
-    "echogarden": "^1.4.3",
+    "echogarden": "^1.4.4",
    "electron-context-menu": "^4.0.0",
    "electron-log": "^5.1.4",
    "electron-settings": "^4.0.4",
--- a/enjoy/src/main/echogarden.ts
+++ b/enjoy/src/main/echogarden.ts
@@ -100,6 +100,7 @@ class EchogardenWrapper {
        transcript: string,
        options: AlignmentOptions
      ) => {
+        logger.debug("echogarden-align:", transcript, options);
        try {
          return await this.align(input, transcript, options);
        } catch (err) {
--- a/enjoy/src/renderer/components/medias/media-loading-modal.tsx
+++ b/enjoy/src/renderer/components/medias/media-loading-modal.tsx
@@ -34,7 +34,7 @@ export const MediaLoadingModal = () => {

  return (
    <AlertDialog open={!decoded || !Boolean(transcription?.result?.timeline)}>
-      <AlertDialogOverlay className="z-[100]" />
+      <AlertDialogOverlay className="" />
      <AlertDialogContent className="z-[100]">
        <AlertDialogHeader>
          <AlertDialogTitle>{t("preparingAudio")}</AlertDialogTitle>
@@ -55,9 +55,7 @@ export const MediaLoadingModal = () => {
                <XCircleIcon className="w-4 h-4 text-destructive" />
              </div>
              <div className="select-text">
-                <div className="mb-2">
-                  {decodeError}
-                </div>
+                <div className="mb-2">{decodeError}</div>
                <div className="text-sm text-muted-foreground">
                  {t("failedToDecodeWaveform")}:{" "}
                  <span className="break-all ">{media?.src}</span>
@@ -97,12 +95,17 @@ export const MediaLoadingModal = () => {
              <div className="inline">
                <span>{t("notTranscribedYet")}</span>
                {decoded && (
-                  <Button
-                    onClick={generateTranscription}
-                    className="ml-4"
-                    size="sm"
-                  >
-                    {t("transcribe")}
+                  <Button asChild className="ml-4" size="sm">
+                    <a
+                      className="cursor-pointer"
+                      onClick={() =>
+                        generateTranscription({
+                          originalText: "",
+                        })
+                      }
+                    >
+                      {t("regenerate")}
+                    </a>
                  </Button>
                )}
              </div>
--- a/enjoy/src/renderer/components/medias/media-transcription-form.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription-form.tsx
@@ -57,7 +57,7 @@ export const TranscriptionForm = (props: {
  const handleSave = async () => {
    setSubmiting(true);
    try {
-      await generateTranscription(content);
+      await generateTranscription({ originalText: content });
      setOpen(false);
    } catch (e) {
      toast.error(e.message);
--- a/enjoy/src/renderer/components/medias/media-transcription.tsx
+++ b/enjoy/src/renderer/components/medias/media-transcription.tsx
@@ -144,7 +144,13 @@ export const MediaTranscription = () => {
                </AlertDialogHeader>
                <AlertDialogFooter>
                  <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
-                  <AlertDialogAction onClick={() => generateTranscription("")}>
+                  <AlertDialogAction
+                    onClick={() =>
+                      generateTranscription({
+                        originalText: "",
+                      })
+                    }
+                  >
                    {t("transcribe")}
                  </AlertDialogAction>
                </AlertDialogFooter>
--- a/enjoy/src/renderer/components/messages/assistant-message.tsx
+++ b/enjoy/src/renderer/components/messages/assistant-message.tsx
@@ -284,7 +284,11 @@ export const AssistantMessageComponent = (props: {
        </DropdownMenu>
      </div>

-      <Sheet open={shadowing} onOpenChange={(value) => setShadowing(value)}>
+      <Sheet
+        modal={false}
+        open={shadowing}
+        onOpenChange={(value) => setShadowing(value)}
+      >
        <SheetContent
          side="bottom"
          className="h-screen p-0"
--- a/enjoy/src/renderer/context/media-player-provider.tsx
+++ b/enjoy/src/renderer/context/media-player-provider.tsx
@@ -66,7 +66,10 @@ type MediaPlayerContextType = {
  pitchChart: Chart;
  // Transcription
  transcription: TranscriptionType;
-  generateTranscription: (text?: string) => void;
+  generateTranscription: (params?: {
+    originalText?: string;
+    language?: string;
+  }) => void;
  transcribing: boolean;
  transcribingProgress: number;
  transcriptionDraft: TranscriptionType["result"];
--- a/enjoy/src/renderer/hooks/use-transcribe.tsx
+++ b/enjoy/src/renderer/hooks/use-transcribe.tsx
@@ -34,6 +34,7 @@ export const useTranscribe = () => {
      targetId?: string;
      targetType?: string;
      originalText?: string;
+      language?: string;
    }
  ): Promise<{
    engine: string;
@@ -42,7 +43,12 @@ export const useTranscribe = () => {
    originalText?: string;
  }> => {
    const url = await transcode(mediaSrc);
-    const { targetId, targetType, originalText } = params || {};
+    const {
+      targetId,
+      targetType,
+      originalText,
+      language = "english",
+    } = params || {};
    const blob = await (await fetch(url)).blob();

    let result;
@@ -75,7 +81,10 @@ export const useTranscribe = () => {

    const alignmentResult = await EnjoyApp.echogarden.align(
      new Uint8Array(await blob.arrayBuffer()),
-      transcript
+      transcript,
+      {
+        language,
+      }
    );

    return {
@@ -197,7 +206,7 @@ export const useTranscribe = () => {
        resolve({
          engine: "azure",
          model: "whisper",
-          text: results.map((result) => result.DisplayText).join(' '),
+          text: results.map((result) => result.DisplayText).join(" "),
        });
      };

--- a/enjoy/src/renderer/hooks/use-transcriptions.tsx
+++ b/enjoy/src/renderer/hooks/use-transcriptions.tsx
@@ -8,7 +8,6 @@ import {
 import { toast } from "@renderer/components/ui";
 import { TimelineEntry } from "echogarden/dist/utilities/Timeline.d.js";
 import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";
-import { ca } from "@vidstack/react/types/vidstack-react";

 export const useTranscriptions = (media: AudioType | VideoType) => {
  const { whisperConfig } = useContext(AISettingsProviderContext);
@@ -53,7 +52,11 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
        });
    };

-  const generateTranscription = async (originalText?: string) => {
+  const generateTranscription = async (params?: {
+    originalText?: string;
+    language?: string;
+  }) => {
+    let { originalText, language } = params || {};
    if (originalText === undefined) {
      if (transcription?.targetId === media.id) {
        originalText = transcription.result?.originalText;
@@ -72,6 +75,7 @@ export const useTranscriptions = (media: AudioType | VideoType) => {
        targetId: media.id,
        targetType: media.mediaType,
        originalText,
+        language,
      });

      let timeline: TimelineEntry[] = [];
--- a/yarn.lock
+++ b/yarn.lock