* escape spaces in the command line

* disable record button when no reference

* notify when transcription has been running for too long

* fix release CI

* fix UI & remove deprecated code

* clear zombie transcribe processes on startup

* fix file removal when a duplicated audio/video is added

* update latest whisper for win32
This commit is contained in:
an-lee
2024-01-18 18:21:55 +08:00
committed by GitHub
parent 8f4503db37
commit f04dd1e3c8
20 changed files with 120 additions and 349 deletions

View File

@@ -21,4 +21,5 @@ jobs:
- if: matrix.os == 'macos-latest'
env:
GITHUB_TOKEN: ${{ secrets.PUBLISH_TOKEN }}
PACKAGE_OS_ARCH: arm64
run: yarn run publish:enjoy --arch=arm64

View File

@@ -178,6 +178,7 @@
"failedToLogin": "Failed to login",
"invalidRedirectUrl": "Invalid redirect url",
"transcribe": "Transcribe",
"stillTranscribing": "AI is still working on the transcription. Please wait or switch to a smaller model to make it faster.",
"unableToSetLibraryPath": "Unable to set library path to {{path}}",
"nthStep": "{{current}}/{{totalSteps}} Step",
"open": "Open",
@@ -265,6 +266,7 @@
"allResources": "all resources",
"playbackRate": "playback rate",
"transcription": "transcription",
"transcript": "transcript",
"regenerate": "regenerate",
"holdAndSpeak": "Hold and speak",
"releaseToStop": "Release to stop",

View File

@@ -178,6 +178,7 @@
"invalidRedirectUrl": "无效的重定向 URL",
"delete": "删除",
"transcribe": "语音转文本",
"stillTranscribing": "语音转文本仍在进行中,请耐心等候。或者您可以切换到另一个更小的模型以加快速度。",
"unableToSetLibraryPath": "无法设置资源库保存路径 {{path}}",
"nthStep": "第 {{current}}/{{totalSteps}} 步",
"open": "打开",
@@ -265,6 +266,7 @@
"allResources": "所有资源",
"playbackRate": "播放速度",
"transcription": "语音文本",
"transcript": "字幕",
"regenerate": "重新生成",
"holdAndSpeak": "按住并说话",
"releaseToStop": "松开停止",

View File

@@ -80,12 +80,24 @@ class AudiosHandler {
});
}
audio.transcribe().catch((err) => {
const timeout = setTimeout(() => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
type: "warning",
message: t("stillTranscribing"),
});
}, 1000 * 10);
audio
.transcribe()
.catch((err) => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
})
.finally(() => {
clearTimeout(timeout);
});
});
}
private async create(

View File

@@ -1,6 +1,7 @@
import { ipcMain, IpcMainEvent } from "electron";
import { Transcription, Audio, Video } from "@main/db/models";
import { WhereOptions, Attributes } from "sequelize";
import { t } from "i18next";
import log from "electron-log/main";
const logger = log.scope("db/handlers/transcriptions-handler");
@@ -30,12 +31,24 @@ class TranscriptionsHandler {
});
if (transcription.state === "pending") {
transcription.process().catch((err) => {
const timeout = setTimeout(() => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
type: "warning",
message: t("stillTranscribing"),
});
}, 1000 * 10);
transcription
.process()
.catch((err) => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
})
.finally(() => {
clearTimeout(timeout);
});
});
}
return transcription.toJSON();
@@ -86,7 +99,24 @@ class TranscriptionsHandler {
throw new Error("models.transcription.notFound");
}
transcription.process({ force: true });
const timeout = setTimeout(() => {
event.sender.send("on-notification", {
type: "warning",
message: t("stillTranscribing"),
});
}, 1000 * 10);
transcription
.process({ force: true })
.catch((err) => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
})
.finally(() => {
clearTimeout(timeout);
});
})
.catch((err) => {
logger.error(err);

View File

@@ -80,12 +80,24 @@ class VideosHandler {
});
}
video.transcribe().catch((err) => {
const timeout = setTimeout(() => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
type: "warning",
message: t("stillTranscribing"),
});
}, 1000 * 10);
video
.transcribe()
.catch((err) => {
event.sender.send("on-notification", {
type: "error",
message: err.message,
});
})
.finally(() => {
clearTimeout(timeout);
});
});
}
private async create(

View File

@@ -65,19 +65,20 @@ db.connect = async () => {
await sequelize.sync();
await sequelize.authenticate();
// TODO:
// clear the large waveform data in DB.
// Remove this in next release
const caches = await CacheObject.findAll({
attributes: ["id", "key"],
// kill the zombie transcribe processes
Transcription.findAll({
where: {
state: "processing",
},
}).then((transcriptions) => {
transcriptions.forEach((transcription) => {
if (transcription.result) {
transcription.update({ state: "finished" });
} else {
transcription.update({ state: "pending" });
}
});
});
const cacheIds: string[] = [];
caches.forEach((cache) => {
if (cache.key.startsWith("waveform")) {
cacheIds.push(cache.id);
}
});
await CacheObject.destroy({ where: { id: cacheIds } });
// vacuum the database
await sequelize.query("VACUUM");

View File

@@ -257,6 +257,16 @@ export class Audio extends Model<Audio> {
const md5 = await hashFile(filePath, { algo: "md5" });
// check if file already exists
const existing = await Audio.findOne({
where: {
md5,
},
});
if (existing) {
throw new Error(t("audioAlreadyAddedToLibrary", { file: filePath }));
}
// Generate ID
const userId = settings.getSync("user.id");
const id = uuidv5(`${userId}/${md5}`, uuidv5.URL);

View File

@@ -279,6 +279,16 @@ export class Video extends Model<Video> {
const md5 = await hashFile(filePath, { algo: "md5" });
// check if file already exists
const existing = await Video.findOne({
where: {
md5,
},
});
if (existing) {
throw new Error(t("videoAlreadyAddedToLibrary", { file: filePath }));
}
// Generate ID
const userId = settings.getSync("user.id");
const id = uuidv5(`${userId}/${md5}`, uuidv5.URL);

View File

@@ -73,11 +73,11 @@ class Whipser {
);
const command = [
this.binMain,
`"${this.binMain}"`,
`--file "${waveFile}"`,
`--model ${settings.whisperModelPath()}`,
`--model "${settings.whisperModelPath()}"`,
"--output-json",
`--output-file ${path.join(tmpDir, filename)}`,
`--output-file "${path.join(tmpDir, filename)}"`,
...extra,
].join(" ");

View File

@@ -1,149 +0,0 @@
import { useState, useEffect } from "react";
import { cn } from "@renderer/lib/utils";
import {
Button,
Popover,
PopoverContent,
PopoverAnchor,
} from "@renderer/components/ui";
import { LookupResult } from "@renderer/components";
import { LanguagesIcon, PlayIcon } from "lucide-react";
/**
 * Renders the words of the current transcription segment group for an audio
 * file, highlighting the word whose time range contains `currentTime`.
 * Clicking a word pauses playback, seeks to the word's start, and anchors a
 * popover menu (play / translate) at the clicked word's on-screen position.
 *
 * NOTE(review): per this diff, the component was deleted in this commit;
 * this is the removed file's content.
 */
export const AudioCaption = (props: {
  audioId: string;
  // current playback position, in seconds
  currentTime: number;
  transcription: TranscriptionGroupType;
  // seek callback; receives a target time in seconds
  onSeek?: (time: number) => void;
  className?: string;
  isPlaying: boolean;
  setIsPlaying: (isPlaying: boolean) => void;
}) => {
  const {
    transcription,
    currentTime,
    onSeek,
    className,
    isPlaying,
    setIsPlaying,
  } = props;

  // Index of the segment (word) currently being spoken.
  const [activeIndex, setActiveIndex] = useState<number>(0);
  // The word the user clicked, plus the pixel offsets used to anchor the
  // lookup popover right under it.
  const [selected, setSelected] = useState<{
    index: number;
    word: string;
    position?: {
      top: number;
      left: number;
    };
  }>();

  // Follow playback: segment offsets are in milliseconds, currentTime is in
  // seconds, so convert before comparing against each [from, to) range.
  useEffect(() => {
    if (!transcription) return;
    const time = Math.round(currentTime * 1000);
    const index = transcription.segments.findIndex(
      (w) => time >= w.offsets.from && time < w.offsets.to
    );
    if (index !== activeIndex) {
      setActiveIndex(index);
    }
  }, [currentTime, transcription]);

  if (!transcription) return null;
  // Nothing to render before the first word of this group starts.
  if (Math.round(currentTime * 1000) < transcription.offsets.from) return null;

  return (
    <div className={cn("relative px-4 py-2 text-lg", className)}>
      <div className="flex flex-wrap">
        {(transcription.segments || []).map((w, index) => (
          <span
            key={index}
            className={`mr-1 cursor-pointer hover:bg-red-500/10 ${
              index === activeIndex ? "text-red-500" : ""
            }`}
            onClick={(event) => {
              {/* Anchor the popover just below the clicked word, then pause
                  and jump playback to that word's start (ms -> s). */}
              setSelected({
                index,
                word: w.text,
                position: {
                  top:
                    event.currentTarget.offsetTop +
                    event.currentTarget.offsetHeight,
                  left: event.currentTarget.offsetLeft,
                },
              });
              setIsPlaying(false);
              if (onSeek) onSeek(w.offsets.from / 1000);
            }}
          >
            {w.text}
          </span>
        ))}
        {/* Popover is only open while a word is selected AND playback is
            paused; resuming play closes it implicitly. */}
        <Popover
          open={Boolean(selected) && !isPlaying}
          onOpenChange={(value) => {
            if (!value) setSelected(null);
          }}
        >
          <PopoverAnchor
            className="absolute w-0 h-0"
            style={{
              top: selected?.position?.top,
              left: selected?.position?.left,
            }}
          ></PopoverAnchor>
          <PopoverContent
            className="w-full max-w-md p-0"
            updatePositionStrategy="always"
          >
            {selected?.word && (
              <AudioCaptionSelectionMenu
                word={selected.word}
                context={transcription.segments.map((w) => w.text).join(" ").trim()}
                audioId={props.audioId}
                onPlay={() => {
                  setIsPlaying(true);
                }}
              />
            )}
          </PopoverContent>
        </Popover>
      </div>
    </div>
  );
};
/**
 * Small action menu shown under a clicked caption word: a play button and a
 * translate button. Pressing translate replaces the buttons with the lookup
 * result for the word in its sentence context.
 */
const AudioCaptionSelectionMenu = (props: {
  word: string;
  context: string;
  audioId: string;
  onPlay: () => void;
}) => {
  const { word, context, audioId, onPlay } = props;
  // Whether the user has switched from the button row to the lookup panel.
  const [translating, setTranslating] = useState<boolean>(false);

  if (!word) return null;

  return translating ? (
    <LookupResult
      word={word}
      context={context}
      sourceId={audioId}
      sourceType={"Audio"}
    />
  ) : (
    <div className="flex items-center p-1">
      <Button onClick={onPlay} variant="ghost" size="icon">
        <PlayIcon size={16} />
      </Button>
      <Button onClick={() => setTranslating(true)} variant="ghost" size="icon">
        <LanguagesIcon size={16} />
      </Button>
    </div>
  );
};

View File

@@ -1,165 +0,0 @@
import {
AlertDialog,
AlertDialogTrigger,
AlertDialogFooter,
AlertDialogHeader,
AlertDialogContent,
AlertDialogTitle,
AlertDialogDescription,
AlertDialogCancel,
AlertDialogAction,
Skeleton,
ScrollArea,
Button,
PingPoint,
} from "@renderer/components/ui";
import React, { useEffect, useContext } from "react";
import { t } from "i18next";
import { LoaderIcon, CheckCircleIcon, MicIcon } from "lucide-react";
import {
DbProviderContext,
AppSettingsProviderContext,
} from "@renderer/context";
/**
 * Side panel showing an audio file's transcription segment by segment, with
 * a per-segment mic icon for segments that already have user recordings, and
 * a confirm dialog that asks the main process to (re)generate the
 * transcription.
 *
 * NOTE(review): per this diff, the component was deleted in this commit;
 * this is the removed file's content.
 */
export const AudioTranscription = (props: {
  audio: AudioType | null;
  // index of the segment currently active in the player, if any
  currentSegmentIndex?: number;
  // called when the user clicks a segment row
  onSelectSegment?: (index: number) => void;
}) => {
  const { addDblistener, removeDbListener } = useContext(DbProviderContext);
  const { EnjoyApp } = useContext(AppSettingsProviderContext);
  const { audio, currentSegmentIndex, onSelectSegment } = props;
  const containerRef = React.createRef<HTMLDivElement>();
  // Recording stats grouped by segment index; drives the mic indicator.
  const [recordingStats, setRecordingStats] =
    React.useState<SegementRecordingStatsType>([]);

  // Fire-and-forget: main process transcribes and notifies via DB events.
  const regenerate = async () => {
    if (!audio) return;
    EnjoyApp.audios.transcribe(audio.id);
  };

  const fetchSegmentStats = async () => {
    if (!audio) return;
    EnjoyApp.recordings.groupBySegment(audio.id).then((stats) => {
      setRecordingStats(stats);
    });
  };

  // Refresh the stats on every DB change while mounted.
  useEffect(() => {
    addDblistener(fetchSegmentStats);
    fetchSegmentStats();
    return () => {
      removeDbListener(fetchSegmentStats);
    };
  }, [audio]);

  // Keep the active segment scrolled into view when it changes.
  useEffect(() => {
    containerRef.current
      ?.querySelector(`#segment-${currentSegmentIndex}`)
      ?.scrollIntoView({
        block: "center",
        inline: "center",
      } as ScrollIntoViewOptions);
  }, [currentSegmentIndex, audio?.transcription]);

  if (!audio)
    return (
      <div className="p-4 w-full">
        <TranscriptionPlaceholder />
      </div>
    );

  return (
    <div className="w-full h-full flex flex-col">
      {/* Header: status dot + (re)generate button behind a confirm dialog */}
      <div className="mb-4 flex items-cener justify-between">
        <div className="flex items-center space-x-2">
          {audio.transcribing ? (
            <PingPoint colorClassName="bg-yellow-500" />
          ) : audio.isTranscribed ? (
            <CheckCircleIcon className="text-green-500 w-4 h-4" />
          ) : (
            <PingPoint colorClassName="bg-mute" />
          )}
          <span className="">{t("transcription")}</span>
        </div>
        <AlertDialog>
          <AlertDialogTrigger asChild>
            <Button disabled={audio.transcribing} className="capitalize">
              {audio.transcribing && (
                <LoaderIcon className="animate-spin w-4 mr-2" />
              )}
              {audio.isTranscribed ? t("regenerate") : t("transcribe")}
            </Button>
          </AlertDialogTrigger>
          <AlertDialogContent>
            <AlertDialogHeader>
              <AlertDialogTitle>{t("transcribe")}</AlertDialogTitle>
              <AlertDialogDescription>
                {t("transcribeAudioConfirmation", {
                  name: audio.name,
                })}
              </AlertDialogDescription>
            </AlertDialogHeader>
            <AlertDialogFooter>
              <AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
              <AlertDialogAction
                className="bg-destructive"
                onClick={regenerate}
              >
                {t("transcribe")}
              </AlertDialogAction>
            </AlertDialogFooter>
          </AlertDialogContent>
        </AlertDialog>
      </div>
      {audio.transcription ? (
        <ScrollArea ref={containerRef} className="flex-1">
          {/* NOTE(review): the map variable `t` shadows the i18next `t`
              imported above — inside this callback `t` is a segment. */}
          {audio.transcription.map((t, index) => (
            <div
              key={index}
              id={`segment-${index}`}
              className={`py-1 px-2 mb-2 cursor-pointer hover:bg-yellow-400/25 ${
                currentSegmentIndex === index ? "bg-yellow-400/25" : ""
              }`}
              onClick={() => {
                onSelectSegment?.(index);
              }}
            >
              <div className="flex items-center justify-between">
                <span className="text-xs opacity-50">#{index + 1}</span>
                <div className="flex items-center space-x-2">
                  {(recordingStats || []).findIndex(
                    (s) => s.segmentIndex === index
                  ) !== -1 && <MicIcon className="w-3 h-3 text-sky-500" />}
                  <span className="text-xs opacity-50">
                    {t.timestamps.from.split(",")[0]}
                  </span>
                </div>
              </div>
              <p className="">{t.text}</p>
            </div>
          ))}
        </ScrollArea>
      ) : (
        <TranscriptionPlaceholder />
      )}
    </div>
  );
};
/**
 * Loading skeleton shown while no transcription is available: five
 * full-width placeholder bars followed by one shorter trailing bar.
 */
export const TranscriptionPlaceholder = () => {
  const rows = Array.from({ length: 5 });
  return (
    <div className="p-4">
      {rows.map((_, index) => (
        <Skeleton key={index} className="h-4 w-full mb-4" />
      ))}
      <Skeleton className="h-4 w-3/5" />
    </div>
  );
};

View File

@@ -95,7 +95,7 @@ export const MediaTranscription = (props: {
) : (
<PingPoint colorClassName="bg-mute" />
)}
<span className="">{t("transcription")}</span>
<span className="capitalize">{t("transcript")}</span>
</div>
<AlertDialog>
<AlertDialogTrigger asChild>
@@ -121,7 +121,6 @@ export const MediaTranscription = (props: {
<AlertDialogFooter>
<AlertDialogCancel>{t("cancel")}</AlertDialogCancel>
<AlertDialogAction
className="bg-destructive"
onClick={regenerate}
>
{t("transcribe")}
@@ -132,7 +131,7 @@ export const MediaTranscription = (props: {
</div>
{transcription?.result ? (
<ScrollArea ref={containerRef} className="flex-1">
<ScrollArea ref={containerRef} className="flex-1 px-2">
{transcription.result.map((t, index) => (
<div
key={index}

View File

@@ -170,7 +170,7 @@ export const RecordingsList = (props: {
<div className="z-50 bottom-16 left-1/2 w-0 h-0 absolute flex items-center justify-center">
<RecordButton
disabled={!referenceId == undefined}
disabled={referenceId == undefined || !Boolean(referenceText)}
onRecordEnd={createRecording}
/>
</div>

View File

@@ -31,15 +31,21 @@ export default defineConfig({
viteStaticCopy({
targets: [
{
src: `lib/whisper.cpp/${os.arch()}/${os.platform()}/*`,
src: `lib/whisper.cpp/${
process.env.PACKAGE_OS_ARCH || os.arch()
}/${os.platform()}/*`,
dest: "lib/whisper",
},
{
src: `lib/youtubedr/${os.arch()}/${os.platform()}/*`,
src: `lib/youtubedr/${
process.env.PACKAGE_OS_ARCH || os.arch()
}/${os.platform()}/*`,
dest: "lib/youtubedr",
},
{
src: `lib/ffmpeg//${os.arch()}/${os.platform()}/*`,
src: `lib/ffmpeg//${
process.env.PACKAGE_OS_ARCH || os.arch()
}/${os.platform()}/*`,
dest: "lib/ffmpeg",
},
{