Improve alignment (#886)
* refactor wordTimeline to sentenceTimeline
* fix twinkle
* fix azure ai
* refactor
* clean code
* refactor cloudflare stt
@@ -10,7 +10,11 @@ import {
   trimAudioEnd,
   AudioSourceParam,
 } from "echogarden/dist/audio/AudioUtilities.js";
-import { Timeline } from "echogarden/dist/utilities/Timeline.d.js";
+import { wordTimelineToSegmentSentenceTimeline } from "echogarden/dist/utilities/Timeline.js";
+import {
+  type Timeline,
+  type TimelineEntry,
+} from "echogarden/dist/utilities/Timeline.d.js";
 import path from "path";
 import log from "@main/logger";
 import url from "url";
@@ -43,6 +47,7 @@ class EchogardenWrapper {
   public getRawAudioDuration: typeof getRawAudioDuration;
   public trimAudioStart: typeof trimAudioStart;
   public trimAudioEnd: typeof trimAudioEnd;
+  public wordTimelineToSegmentSentenceTimeline: typeof wordTimelineToSegmentSentenceTimeline;

   constructor() {
     this.align = Echogarden.align;
@@ -54,6 +59,8 @@ class EchogardenWrapper {
     this.getRawAudioDuration = getRawAudioDuration;
     this.trimAudioStart = trimAudioStart;
     this.trimAudioEnd = trimAudioEnd;
+    this.wordTimelineToSegmentSentenceTimeline =
+      wordTimelineToSegmentSentenceTimeline;
   }

   async check() {
@@ -132,6 +139,37 @@ class EchogardenWrapper {
       }
     );

+    ipcMain.handle(
+      "echogarden-word-to-sentence-timeline",
+      async (
+        _event,
+        wordTimeline: Timeline,
+        transcript: string,
+        language: string
+      ) => {
+        logger.debug("echogarden-word-to-sentence-timeline:", transcript);
+
+        const { segmentTimeline } =
+          await this.wordTimelineToSegmentSentenceTimeline(
+            wordTimeline,
+            transcript,
+            language.split("-")[0]
+          );
+        const timeline: Timeline = [];
+        segmentTimeline.forEach((t: TimelineEntry) => {
+          if (t.type === "sentence") {
+            timeline.push(t);
+          } else {
+            t.timeline.forEach((st) => {
+              timeline.push(st);
+            });
+          }
+        });
+
+        return timeline;
+      }
+    );
+
     ipcMain.handle(
       "echogarden-transcode",
       async (_event, url: string, sampleRate?: number) => {
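For context, the handler above flattens the nested structure that echogarden's `wordTimelineToSegmentSentenceTimeline` produces (segment entries that contain sentence entries) into a flat sentence list. A rough sketch with simplified stand-in types and made-up timings, not the actual echogarden definitions:

```ts
// Simplified stand-ins for echogarden's Timeline types (illustrative only).
type Entry = {
  type: "segment" | "sentence" | "word";
  text: string;
  startTime: number;
  endTime: number;
  timeline?: Entry[];
};

// Hypothetical result of wordTimelineToSegmentSentenceTimeline:
// segment entries that contain sentence entries.
const segmentTimeline: Entry[] = [
  {
    type: "segment",
    text: "Hello world. How are you?",
    startTime: 0,
    endTime: 3.2,
    timeline: [
      { type: "sentence", text: "Hello world.", startTime: 0, endTime: 1.4 },
      { type: "sentence", text: "How are you?", startTime: 1.5, endTime: 3.2 },
    ],
  },
];

// The same flattening the IPC handler performs: keep sentence entries as-is,
// otherwise unwrap the sentences nested inside a segment entry.
const flat: Entry[] = [];
segmentTimeline.forEach((t) => {
  if (t.type === "sentence") flat.push(t);
  else (t.timeline ?? []).forEach((st) => flat.push(st));
});

console.log(flat.map((s) => s.text)); // ["Hello world.", "How are you?"]
```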
@@ -105,8 +105,6 @@ class Whipser {
       `--model "${model.savePath}"`,
       "--output-json",
       `--output-file "${path.join(tmpDir, "jfk")}"`,
-      `--split-on-word true`,
-      `--max-len 1`,
     ];
     logger.debug(`Checking whisper command: ${commands.join(" ")}`);
     exec(
@@ -205,9 +203,9 @@ class Whipser {
       "--print-progress",
       "--language",
       model.name.includes("en") ? "en" : language?.split("-")?.[0] || "auto",
-      `--split-on-word`,
-      `--max-len`,
-      "1",
+      // `--split-on-word`,
+      // `--max-len`,
+      // "1",
       ...extra,
     ];
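With `--split-on-word` and `--max-len 1` gone, whisper.cpp emits phrase-level segments instead of one entry per word. A sketch of roughly what the hook below consumes; the field names (`transcription`, `offsets.from/to` in milliseconds, `model.type`) are the ones read in this commit, the values are invented:

```ts
// Illustrative shape only; values are made up.
const res = {
  model: { type: "base.en" },
  transcription: [
    { text: " Hello world.", offsets: { from: 0, to: 1400 } },
    { text: " [Music]", offsets: { from: 1400, to: 2000 } }, // dropped later
  ],
};

// Offsets are milliseconds, hence the division by 1000 in useTranscribe.
const timeline = res.transcription
  .filter((s) => s.text.trim() && !s.text.trim().match(/^[\[\(].+[\]\)]$/))
  .map((s) => ({
    type: "segment",
    text: s.text.trim(),
    startTime: s.offsets.from / 1000.0,
    endTime: s.offsets.to / 1000.0,
  }));

console.log(timeline.length); // 1
```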
@@ -441,7 +441,24 @@ contextBridge.exposeInMainWorld("__ENJOY_APP__", {
       return ipcRenderer.invoke("echogarden-align", input, transcript, options);
     },
     alignSegments: (input: string, timeline: Timeline, options: any) => {
-      return ipcRenderer.invoke("echogarden-align-segments", input, timeline, options);
+      return ipcRenderer.invoke(
+        "echogarden-align-segments",
+        input,
+        timeline,
+        options
+      );
     },
+    wordToSentenceTimeline: (
+      wordTimeline: Timeline,
+      transcript: string,
+      language: string
+    ) => {
+      return ipcRenderer.invoke(
+        "echogarden-word-to-sentence-timeline",
+        wordTimeline,
+        transcript,
+        language
+      );
+    },
     transcode: (input: string) => {
       return ipcRenderer.invoke("echogarden-transcode", input);
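From renderer code, the new bridge method is reachable through the exposed `__ENJOY_APP__` object (the `EnjoyApp` context value used in the hooks below). A minimal usage sketch with a fabricated word timeline:

```ts
// Sketch only: the word timeline and language code here are made up.
const wordTimeline = [
  { type: "word", text: "Hello", startTime: 0.0, endTime: 0.4 },
  { type: "word", text: "world.", startTime: 0.5, endTime: 0.9 },
];

const enjoyApp = (window as any).__ENJOY_APP__;
const sentenceTimeline = await enjoyApp.echogarden.wordToSentenceTimeline(
  wordTimeline,
  "Hello world.",
  "en" // language already stripped of its region suffix, e.g. "en-US" -> "en"
);
console.log(sentenceTimeline);
```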
@@ -51,11 +51,13 @@ export const MediaProvider = () => {
         language: transcription.result.language,
       })
     );
+  }, [player, transcription]);

+  useEffect(() => {
     return () => {
       setMediaProvider(null);
     };
-  }, [player, transcription]);
+  }, [media?.src]);

   if (!media?.src) return null;
@@ -559,7 +559,7 @@ export const MediaPlayerProvider = ({
       setDecoded(false);
       setDecodeError(null);
     };
-  }, [media?.src, ref, mediaProvider, layout?.playerHeight]);
+  }, [media?.src, ref?.current, mediaProvider, layout?.playerHeight]);

   /* cache last segment index */
   useEffect(() => {
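The dependency change swaps the ref object for `ref?.current`. A ref object is referentially stable for the component's lifetime, so `[ref]` never re-triggers the effect; `ref.current` is null on the first render and points at the element once it mounts, so a later render re-runs the effect against the real element. A generic sketch of the pattern (hook name and details are illustrative, not from the repo):

```ts
import { useEffect, useRef } from "react";

// Illustrative hook: re-run setup once the element behind the ref exists.
export function usePlayerSetup(src: string) {
  const ref = useRef<HTMLDivElement | null>(null);

  useEffect(() => {
    if (!ref.current) return;
    // initialize a player against ref.current here...
    return () => {
      // ...and tear it down when src changes or the element is replaced
    };
  }, [src, ref.current]); // mirrors the dependency change above

  return ref;
}
```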
@@ -19,82 +19,10 @@ import take from "lodash/take";
 import sortedUniqBy from "lodash/sortedUniqBy";
+import { parseText } from "media-captions";

-/*
- * define the regex pattern to match the end of a sentence
- * the end of a sentence is defined as a period, question mark, or exclamation mark
- * also it may be followed by a quotation mark
- * and exclude sepecial cases like "Mr.", "Mrs.", "Dr.", "Ms.", "etc."
- */
-const sentenceEndPattern = /(?<!Mr|Mrs|Dr|Ms|etc)\.|\?|!\"?/;
-
-// test a text string has any punctuations or not
-// some transcribed text may not have any punctuations
-const punctuationsPattern = /\w[.,!?](\s|$)/g;
-
-/*
- * convert the word timeline to sentence timeline
- * a sentence is a group of words that ends with a punctuation
- */
-const wordTimelineToSentenceTimeline = (
-  wordTimeline: TimelineEntry[]
-): TimelineEntry[] => {
-  const timeline: TimelineEntry[] = [];
-
-  wordTimeline.forEach((word, index) => {
-    word.text = word.text.trim();
-    // skip empty words
-    if (!word.text) return;
-    // skip music or sound effects quoted in []
-    if (word.text.match(/^\[.*\]$/)) return;
-
-    const wordEntry = {
-      type: "word" as TimelineEntryType,
-      text: word.text,
-      startTime: word.startTime,
-      endTime: word.endTime,
-    };
-
-    let sentence: TimelineEntry;
-    // get the last sentence in the timeline
-    if (timeline.length > 0) {
-      sentence = timeline[timeline.length - 1];
-    }
-
-    // if there is no sentence in the timeline, create a new sentence
-    // if last sentence is a punctuation, create a new sentence
-    if (!sentence || sentence.text.match(sentenceEndPattern)) {
-      sentence = {
-        type: "sentence" as TimelineEntryType,
-        text: "",
-        startTime: wordEntry.startTime,
-        endTime: wordEntry.endTime,
-        timeline: [],
-      };
-      timeline.push(sentence);
-    }
-
-    // if the word is a punctuation, add it to the sentence and start a new sentence
-    if (wordEntry.text.match(sentenceEndPattern)) {
-      sentence.text += wordEntry.text;
-      sentence.endTime = wordEntry.endTime;
-
-      const lastSentence = timeline[timeline.length - 1];
-      if (lastSentence.endTime !== sentence.endTime) {
-        timeline.push(sentence);
-      }
-    } else {
-      sentence.text += wordEntry.text + " ";
-      sentence.endTime = wordEntry.endTime;
-
-      if (index === wordTimeline.length - 1) {
-        timeline.push(sentence);
-      }
-    }
-  });
-
-  return timeline;
-};
-
 export const useTranscribe = () => {
   const { EnjoyApp, user, webApi } = useContext(AppSettingsProviderContext);
   const { openai } = useContext(AISettingsProviderContext);
@@ -208,28 +136,11 @@ export const useTranscribe = () => {
         isolate,
       }
     );

-      wordTimeline.forEach((word: TimelineEntry) => {
-        let sentence = timeline.find(
-          (entry) =>
-            word.startTime >= entry.startTime && word.endTime <= entry.endTime
-        );
-
-        if (sentence) {
-          sentence.timeline.push(word);
-        }
-      });
-
-      /*
-       * the start time of a sentence should be the start time of the first word in the sentence
-       * the end time of a sentence should the end time of the last word in the sentence
-       */
-      // timeline.forEach((t) => {
-      //   if (t.timeline.length === 0) return;
-
-      //   t.startTime = t.timeline[0].startTime;
-      //   t.endTime = t.timeline[t.timeline.length - 1].endTime;
-      // });
+      timeline = await EnjoyApp.echogarden.wordToSentenceTimeline(
+        wordTimeline,
+        transcript,
+        language.split("-")[0]
+      );
     } else {
       // Remove all content inside `()`, `[]`, `{}` and trim the text
       // remove all markdown formatting
@@ -299,20 +210,34 @@ export const useTranscribe = () => {
       }
     );

-    const wordTimeline: TimelineEntry[] = res.transcription.map((word) => {
-      return {
-        type: "word" as TimelineEntryType,
-        text: word.text,
-        startTime: word.offsets.from / 1000.0,
-        endTime: word.offsets.to / 1000.0,
-      };
-    });
-    const timeline = wordTimelineToSentenceTimeline(wordTimeline);
+    const timeline: TimelineEntry[] = res.transcription
+      .map((segment) => {
+        // ignore the word if it is empty or in the format of `[xxx]` or `(xxx)`
+        if (
+          !segment.text.trim() ||
+          segment.text.trim().match(/^[\[\(].+[\]\)]$/)
+        ) {
+          return null;
+        }
+
+        return {
+          type: "segment" as TimelineEntryType,
+          text: segment.text.trim(),
+          startTime: segment.offsets.from / 1000.0,
+          endTime: segment.offsets.to / 1000.0,
+        };
+      })
+      .filter((s) => Boolean(s?.text));
+
+    const transcript = timeline
+      .map((segment) => segment.text)
+      .join(" ")
+      .trim();

     return {
       engine: "whisper",
       model: res.model.type,
-      text: res.transcription.map((segment) => segment.text).join(" "),
+      text: transcript,
       timeline,
     };
   };
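The `/^[\[\(].+[\]\)]$/` guard is meant to drop non-speech annotations from the whisper output before they reach the timeline. A quick illustration of what it matches:

```ts
const nonSpeech = /^[\[\(].+[\]\)]$/;

["[Music]", "(applause)", "[BLANK_AUDIO]", "Hello world."].forEach((t) =>
  console.log(t, nonSpeech.test(t)) // true, true, true, false
);
```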
@@ -337,14 +262,14 @@ export const useTranscribe = () => {
       file,
       model: "whisper-1",
       response_format: "verbose_json",
-      timestamp_granularities: ["word"],
+      timestamp_granularities: ["segment"],
     })) as any;

     let timeline: TimelineEntry[] = [];
     if (res.segments) {
       res.segments.forEach((segment) => {
         const segmentTimeline = {
-          type: "sentence" as TimelineEntryType,
+          type: "segment" as TimelineEntryType,
           text: segment.text,
           startTime: segment.start,
           endTime: segment.end,
@@ -353,16 +278,6 @@ export const useTranscribe = () => {

         timeline.push(segmentTimeline);
       });
-    } else if (res.words) {
-      const wordTimeline = res.words.map((word) => {
-        return {
-          type: "word" as TimelineEntryType,
-          text: word.word,
-          startTime: word.start,
-          endTime: word.end,
-        };
-      });
-      timeline = wordTimelineToSentenceTimeline(wordTimeline);
     }

     return {
@@ -390,15 +305,16 @@ export const useTranscribe = () => {
       })
     ).data;

-    const wordTimeline = res.words.map((word) => {
+    const caption = await parseText(res.vtt, { type: "vtt" });
+    const timeline: Timeline = caption.cues.map((cue) => {
       return {
-        type: "word" as TimelineEntryType,
-        text: word.word,
-        startTime: word.start,
-        endTime: word.end,
+        type: "segment",
+        text: cue.text,
+        startTime: cue.startTime,
+        endTime: cue.endTime,
+        timeline: [],
       };
     });
-    const timeline = wordTimelineToSentenceTimeline(wordTimeline);

     return {
       engine: "cloudflare",
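The Cloudflare path now ignores the word list and parses the response's `vtt` field instead, turning each cue into a segment entry. A sketch with a made-up VTT payload (the `parseText` call and cue fields are the ones used above):

```ts
import { parseText } from "media-captions";

// Fabricated VTT content for illustration.
const vtt = `WEBVTT

00:00.000 --> 00:01.400
Hello world.

00:01.500 --> 00:03.200
How are you?`;

const caption = await parseText(vtt, { type: "vtt" });
const timeline = caption.cues.map((cue) => ({
  type: "segment",
  text: cue.text,
  startTime: cue.startTime, // seconds
  endTime: cue.endTime,
  timeline: [],
}));

console.log(timeline.map((t) => t.text)); // ["Hello world.", "How are you?"]
```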
@@ -435,7 +351,13 @@ export const useTranscribe = () => {

     let results: SpeechRecognitionResultType[] = [];

-    return new Promise((resolve, reject) => {
+    const res: {
+      engine: string;
+      model: string;
+      text: string;
+      tokenId: number;
+      timeline?: TimelineEntry[];
+    } = await new Promise((resolve, reject) => {
       reco.recognizing = (_s, e) => {
         setOutput(e.result.text);
       };
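Instead of returning the Promise directly, the function now awaits a typed Promise and can post-process the recognition results before returning. The general shape of that pattern, sketched with a dummy recognizer rather than the Azure SDK:

```ts
// Generic sketch: wrap an event-driven recognizer in an awaitable, typed
// Promise, then keep working with the resolved value afterwards.
type RecognitionResult = { text: string; timeline?: unknown[] };

function recognizeOnce(
  start: (done: (r: RecognitionResult) => void) => void
): Promise<RecognitionResult> {
  return new Promise((resolve) => start(resolve));
}

async function run() {
  const res = await recognizeOnce((done) =>
    setTimeout(() => done({ text: "Hello world." }), 10)
  );
  // post-processing can happen here before returning
  return { ...res, engine: "azure" };
}
```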
@@ -454,44 +376,41 @@ export const useTranscribe = () => {
         }

         reco.stopContinuousRecognitionAsync();
         console.log("CANCELED: Reason=" + e.reason);
       };

-      reco.sessionStopped = (_s, _e) => {
+      reco.sessionStopped = async (_s, e) => {
         console.log(
           "Session stopped. Stop continuous recognition.",
           e.sessionId,
           results
         );
         reco.stopContinuousRecognitionAsync();

-        const wordTimeline: TimelineEntry[] = [];
+        const transcript = results
+          .map((result) => result.DisplayText)
+          .join(" ")
+          .trim();
+
+        const timeline: Timeline = [];
         results.forEach((result) => {
           const best = take(sortedUniqBy(result.NBest, "Confidence"), 1)[0];
-          const splitedWords = best.Display.trim().split(" ");
+          const firstWord = best.Words[0];
+          const lastWord = best.Words[best.Words.length - 1];

-          best.Words.forEach((word, index) => {
-            let text = word.Word;
-            if (splitedWords.length === best.Words.length) {
-              text = splitedWords[index];
-            }
-
-            if (
-              index === best.Words.length - 1 &&
-              !text.trim().match(sentenceEndPattern)
-            ) {
-              text = text + ".";
-            }
-
-            wordTimeline.push({
-              type: "word" as TimelineEntryType,
-              text,
-              startTime: word.Offset / 10000000.0,
-              endTime: (word.Offset + word.Duration) / 10000000.0,
-            });
+          timeline.push({
+            type: "sentence",
+            text: best.Display,
+            startTime: firstWord.Offset / 10000000.0,
+            endTime: (lastWord.Offset + lastWord.Duration) / 10000000.0,
+            timeline: [],
           });
         });

-        const timeline = wordTimelineToSentenceTimeline(wordTimeline);
-
         resolve({
           engine: "azure",
           model: "whisper",
-          text: results.map((result) => result.DisplayText).join(" "),
+          text: transcript,
           timeline,
           tokenId: id,
         });
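Azure's detailed results report word offsets in 100-nanosecond ticks, which is why the code divides by 10,000,000 to get seconds. A rough sketch of one `NBest` result as consumed above (values invented):

```ts
// Illustrative shape only; numbers are made up.
const result = {
  DisplayText: "Hello world.",
  NBest: [
    {
      Confidence: 0.93,
      Display: "Hello world.",
      Words: [
        { Word: "hello", Offset: 5_000_000, Duration: 4_000_000 },
        { Word: "world", Offset: 10_000_000, Duration: 4_500_000 },
      ],
    },
  ],
};

const best = result.NBest[0];
const firstWord = best.Words[0];
const lastWord = best.Words[best.Words.length - 1];

const sentenceEntry = {
  type: "sentence",
  text: best.Display,
  startTime: firstWord.Offset / 10000000.0, // 0.5 s
  endTime: (lastWord.Offset + lastWord.Duration) / 10000000.0, // 1.45 s
  timeline: [],
};
console.log(sentenceEntry);
```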
@@ -499,6 +418,8 @@ export const useTranscribe = () => {

       reco.startContinuousRecognitionAsync();
     });
+
+    return res;
   };

   return {
@@ -11,7 +11,7 @@ import { MAGIC_TOKEN_REGEX, END_OF_SENTENCE_REGEX } from "@/constants";

 export const useTranscriptions = (media: AudioType | VideoType) => {
   const { whisperConfig } = useContext(AISettingsProviderContext);
-  const { EnjoyApp, webApi, learningLanguage } = useContext(
+  const { EnjoyApp, learningLanguage } = useContext(
     AppSettingsProviderContext
   );
   const { addDblistener, removeDbListener } = useContext(DbProviderContext);
enjoy/src/types/enjoy-app.d.ts
@@ -257,6 +257,11 @@ type EnjoyAppType = {
       timeline: Timeline,
       options?: any
     ) => Promise<Timeline>;
+    wordToSentenceTimeline: (
+      wordTimeline: Timeline,
+      transcript: string,
+      language: string
+    ) => Promise<Timeline>;
     transcode: (input: string) => Promise<string>;
     check: () => Promise<boolean>;
   };
enjoy/src/types/index.d.ts
@@ -77,6 +77,7 @@ type WhisperOutputType = {

 type CfWhipserOutputType = {
   text: string;
+  vtt: string;
   words_count: number;
   words: {
     word: string;
@@ -27,6 +27,7 @@ export default defineConfig((env) => {
         ...external,
         "echogarden/dist/api/API.js",
         "echogarden/dist/audio/AudioUtilities.js",
+        "echogarden/dist/utilities/Timeline.js",
       ],
       output: {
         strict: false,