Fix download script (#317)

* Bundle tiny.en as the default Whisper model

* Improve the download-whisper-model script

* Improve the download-ffmpeg-wasm script
This commit is contained in:
an-lee
2024-02-18 10:56:52 +08:00
committed by GitHub
parent da09134e37
commit fdc3c80d33
4 changed files with 81 additions and 42 deletions

View File

@@ -35,9 +35,7 @@ await Promise.all(
console.info(chalk.green(`✅ File ${file.name} valid`));
} else {
console.warn(
chalk.yellow(
`❌ File ${file.name} not valid, start to redownload`
)
chalk.yellow(`❌ File ${file.name} not valid, start to redownload`)
);
fs.removeSync(path.join(dir, file.name));
pendingFiles.push(file);
@@ -81,6 +79,8 @@ if (proxyUrl) {
}
const download = async (url, dest, md5) => {
console.info(chalk.blue(`=> Start to download ${url} to ${dest}`));
return spinner(async () => {
console.info(chalk.blue(`=> Start to download file ${url}`));
await axios
@@ -89,22 +89,27 @@ const download = async (url, dest, md5) => {
})
.then(async (response) => {
const data = Buffer.from(response.data, "binary");
console.info(chalk.green(`${dest} downloaded successfully`));
fs.writeFileSync(dest, data);
const hash = await hashFile(dest, { algo: "md5" });
if (hash === md5) {
console.info(chalk.green(`${dest} downloaded successfully`));
console.info(chalk.green(`${dest} valid`));
} else {
console.error(
chalk.red(
`❌ Error: ${dest} MD5 not match, ${hash} should be ${md5}`
`❌ Error: ${dest} not valid. \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"`
)
);
process.exit(1);
}
})
.catch((err) => {
console.error(chalk.red(`❌ Error: ${err}`));
console.error(
chalk.red(
`❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"`
)
);
process.exit(1);
});
});
@@ -126,12 +131,17 @@ const cleanup = () => {
try {
fs.removeSync(path.join(dir, file.name));
} catch (err) {
console.error(chalk.red(`❌ Error: ${err}`));
console.error(
chalk.red(
`❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"`
)
);
}
});
};
const baseURL = "https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/esm";
// const baseURL = "https://unpkg.com/@ffmpeg/core-mt@0.12.6/dist/esm";
const baseURL = "https://enjoy-storage.baizhiheizi.com";
try {
await Promise.all(
pendingFiles.map((file) =>
@@ -139,7 +149,11 @@ try {
)
);
} catch (err) {
console.error(chalk.red(`❌ Error: ${err}`));
console.error(
chalk.red(
`❌ Failed to download(${err}). \nPlease try again using the command "yarn workspace enjoy download-ffmpeg-wasm"`
)
);
cleanup();
process.exit(1);
}

View File

@@ -4,8 +4,8 @@ import axios from "axios";
import progress from "progress";
import { createHash } from "crypto";
const model = "ggml-base.en-q5_1.bin";
const md5 = "55309cc6613788f07ac7988985210734";
const model = "ggml-tiny.en.bin";
const sha = "c78c86eb1a8faa21b369bcd33207cc90d64ae9df";
const dir = path.join(process.cwd(), "lib/whisper.cpp/models");
@@ -15,8 +15,8 @@ fs.ensureDirSync(dir);
try {
if (fs.statSync(path.join(dir, model)).isFile()) {
console.info(chalk.green(`✅ Model ${model} already exists`));
const hash = await hashFile(path.join(dir, model), { algo: "md5" });
if (hash === md5) {
const hash = await hashFile(path.join(dir, model), { algo: "sha1" });
if (hash === sha) {
console.info(chalk.green(`✅ Model ${model} valid`));
process.exit(0);
} else {
@@ -50,11 +50,12 @@ if (proxyUrl) {
};
}
const modelUrlPrefix =
"https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
// const modelUrlPrefix =
// "https://huggingface.co/ggerganov/whisper.cpp/resolve/main";
const modelUrlPrefix = "https://enjoy-storage.baizhiheizi.com";
function hashFile(path, options) {
const algo = options.algo || "md5";
const algo = options.algo || "sha1";
return new Promise((resolve, reject) => {
const hash = createHash(algo);
const stream = fs.createReadStream(path);
@@ -65,6 +66,7 @@ function hashFile(path, options) {
}
const download = async (url, dest) => {
console.info(chalk.blue(`=> Start to download from ${url} to ${dest}`));
return axios
.get(url, { responseType: "stream" })
.then((response) => {
@@ -82,13 +84,28 @@ const download = async (url, dest) => {
progressBar.tick(chunk.length);
});
response.data.pipe(fs.createWriteStream(dest)).on("close", () => {
response.data.pipe(fs.createWriteStream(dest)).on("close", async () => {
console.info(chalk.green(`✅ Model ${model} downloaded successfully`));
process.exit(0);
const hash = await hashFile(path.join(dir, model), { algo: "sha1" });
if (hash === sha) {
console.info(chalk.green(`✅ Model ${model} valid`));
process.exit(0);
} else {
console.error(
chalk.red(
`❌ Model ${model} not valid, please try again using command \`yarn workspace enjoy download-whisper-model\``
)
);
process.exit(1);
}
});
})
.catch((err) => {
console.error(chalk.red(`❌ Error: ${err}`));
console.error(
chalk.red(
`❌ Failed to download ${url}: ${err}.\nPlease try again using command \`yarn workspace enjoy download-whisper-model\``
)
);
process.exit(1);
});
};

View File

@@ -12,31 +12,36 @@ export const WHISPER_MODELS_OPTIONS = [
{
type: "tiny",
name: "ggml-tiny.en.bin",
size: "77.7 MB",
size: "75 MB",
sha: "c78c86eb1a8faa21b369bcd33207cc90d64ae9df",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-tiny.en.bin",
},
{
type: "base",
name: "ggml-base.en.bin",
size: "148 MB",
size: "142 MB",
sha: "137c40403d78fd54d454da0f9bd998f78703390c",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin",
},
{
type: "small",
name: "ggml-small.en.bin",
size: "488 MB",
size: "466 MB",
sha: "db8a495a91d927739e50b3fc1cc4c6b8f6c2d022",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-small.en.bin",
},
{
type: "medium",
name: "ggml-medium.en.bin",
size: "1.53 GB",
size: "1.5 GB",
sha: "8c30f0e44ce9560643ebd10bbe50cd20eafd3723",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-medium.en.bin",
},
{
type: "large",
name: "ggml-large-v3.bin",
size: "3.09 GB",
size: "2.9 GB",
sha: "ad82bf6a9043ceed055076d0fd39f5f186ff8062",
url: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-large-v3.bin",
},
];

View File

@@ -10,7 +10,7 @@ const logger = log.scope("whisper");
class Whipser {
private binMain: string;
private defaultModel: string;
private bundledModelsDir: string;
public config: WhisperConfigType;
constructor(config?: WhisperConfigType) {
@@ -20,13 +20,7 @@ class Whipser {
"whisper",
"main"
);
this.defaultModel = path.join(
__dirname,
"lib",
"whisper",
"models",
"ggml-base.en-q5_1.bin"
);
this.bundledModelsDir = path.join(__dirname, "lib", "whisper", "models");
if (fs.existsSync(customWhisperPath)) {
this.binMain = customWhisperPath;
} else {
@@ -36,23 +30,32 @@ class Whipser {
currentModel() {
if (!this.config.availableModels) return;
if (!this.config.model) {
const model = this.config.availableModels[0];
settings.setSync("whisper.model", this.config.availableModels[0].name);
return model.savePath;
let model: WhisperConfigType["availableModels"][0];
if (this.config.model) {
model = (this.config.availableModels || []).find(
(m) => m.name === this.config.model
);
}
if (!model) {
model = this.config.availableModels[0];
}
return (this.config.availableModels || []).find(
(m) => m.name === this.config.model
)?.savePath;
settings.setSync("whisper.model", model.name);
return model.savePath;
}
async initialize() {
const bundleModels = fs.readdirSync(this.bundledModelsDir);
const dir = path.join(settings.libraryPath(), "whisper", "models");
fs.ensureDirSync(dir);
const files = fs.readdirSync(dir);
const availableModelFiles = bundleModels.concat(files);
const models = [];
for (const file of files) {
for (const file of availableModelFiles) {
const model = WHISPER_MODELS_OPTIONS.find((m) => m.name == file);
if (!model) continue;
@@ -102,7 +105,7 @@ class Whipser {
async check() {
await this.initialize();
const model = this.currentModel() || this.defaultModel;
const model = this.currentModel();
const sampleFile = path.join(__dirname, "samples", "jfk.wav");
const tmpDir = settings.cachePath();
@@ -169,7 +172,7 @@ class Whipser {
throw new Error("No file or blob provided");
}
const model = this.currentModel() || this.defaultModel;
const model = this.currentModel();
if (blob) {
const format = blob.type.split("/")[1];