mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2026-06-15 23:20:32 +03:00
Native Embedder model selection (incl: Multilingual support) (#3835)
* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-text-embed-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null seelction or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
63
server/utils/EmbeddingEngines/native/constants.js
Normal file
63
server/utils/EmbeddingEngines/native/constants.js
Normal file
@@ -0,0 +1,63 @@
|
||||
/**
 * Embedding models the native embedder can run, keyed by model id.
 *
 * Each entry holds:
 * - `maxConcurrentChunks`: how many strings the embedder processes in a single pass.
 * - `embeddingMaxChunkLength`: maximum chunk length in CHARACTERS (not tokens).
 * - `chunkPrefix`: text prefix for document chunks ("" when the model needs none).
 * - `queryPrefix`: text prefix for search queries ("" when the model needs none).
 * - `apiInfo`: display metadata for the model (id, name, description, language,
 *   download size and model card URL).
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
  "Xenova/all-MiniLM-L6-v2": {
    maxConcurrentChunks: 25,
    // This is a character count, NOT a token count. The model card limit is
    // 512 tokens; assuming roughly 2 characters per token that is ~1,024
    // characters, so we deliberately undershoot to 1,000.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "",
    queryPrefix: "",
    apiInfo: {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
    },
  },
  "Xenova/nomic-embed-text-v1": {
    maxConcurrentChunks: 5,
    // This is a character count, NOT a token count. The model card limit is
    // 8192 tokens; assuming roughly 2 characters per token that is ~16,384
    // characters, so we deliberately undershoot to 16,000.
    embeddingMaxChunkLength: 16_000,
    chunkPrefix: "search_document: ",
    queryPrefix: "search_query: ",
    apiInfo: {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
    },
  },
  "MintplexLabs/multilingual-e5-small": {
    maxConcurrentChunks: 5,
    // This is a character count, NOT a token count. The model card limit is
    // 512 tokens; assuming roughly 2 characters per token that is ~1,024
    // characters, so we deliberately undershoot to 1,000.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "passage: ",
    queryPrefix: "query: ",
    apiInfo: {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
    },
  },
};
|
||||
|
||||
module.exports = {
|
||||
SUPPORTED_NATIVE_EMBEDDING_MODELS,
|
||||
};
|
||||
@@ -2,37 +2,114 @@ const path = require("path");
|
||||
const fs = require("fs");
|
||||
const { toChunks } = require("../../helpers");
|
||||
const { v4 } = require("uuid");
|
||||
const { SUPPORTED_NATIVE_EMBEDDING_MODELS } = require("./constants");
|
||||
|
||||
class NativeEmbedder {
|
||||
static defaultModel = "Xenova/all-MiniLM-L6-v2";
|
||||
|
||||
/**
 * Supported embedding models for native.
 * @type {Record<string, {
 *  maxConcurrentChunks: number;
 *  embeddingMaxChunkLength: number;
 *  chunkPrefix: string;
 *  queryPrefix: string;
 *  apiInfo: {
 *    id: string;
 *    name: string;
 *    description: string;
 *    lang: string;
 *    size: string;
 *    modelCard: string;
 *  };
 * }>}
 */
|
||||
static supportedModels = SUPPORTED_NATIVE_EMBEDDING_MODELS;
|
||||
|
||||
// This is a folder that Mintplex Labs hosts for those who cannot capture the HF model download
|
||||
// endpoint for various reasons. This endpoint is not guaranteed to be active or maintained
|
||||
// and may go offline at any time at Mintplex Labs's discretion.
|
||||
#fallbackHost = "https://cdn.anythingllm.com/support/models/";
|
||||
|
||||
constructor() {
|
||||
// Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
|
||||
this.model = "Xenova/all-MiniLM-L6-v2";
|
||||
this.model = this.getEmbeddingModel();
|
||||
this.modelInfo = this.getEmbedderInfo();
|
||||
this.cacheDir = path.resolve(
|
||||
process.env.STORAGE_DIR
|
||||
? path.resolve(process.env.STORAGE_DIR, `models`)
|
||||
: path.resolve(__dirname, `../../../storage/models`)
|
||||
);
|
||||
this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
|
||||
this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
|
||||
this.modelDownloaded = fs.existsSync(this.modelPath);
|
||||
|
||||
// Limit of how many strings we can process in a single pass to stay with resource or network limits
|
||||
this.maxConcurrentChunks = 25;
|
||||
this.embeddingMaxChunkLength = 1_000;
|
||||
this.maxConcurrentChunks = this.modelInfo.maxConcurrentChunks;
|
||||
this.embeddingMaxChunkLength = this.modelInfo.embeddingMaxChunkLength;
|
||||
|
||||
// Make directory when it does not exist in existing installations
|
||||
if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
|
||||
this.log("Initialized");
|
||||
this.log(`Initialized ${this.model}`);
|
||||
}
|
||||
|
||||
log(text, ...args) {
|
||||
console.log(`\x1b[36m[NativeEmbedder]\x1b[0m ${text}`, ...args);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the selected model from the environment variable.
|
||||
* @returns {string}
|
||||
*/
|
||||
static _getEmbeddingModel() {
|
||||
const envModel =
|
||||
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
|
||||
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
|
||||
return NativeEmbedder.defaultModel;
|
||||
}
|
||||
|
||||
get embeddingPrefix() {
|
||||
return NativeEmbedder.supportedModels[this.model]?.chunkPrefix || "";
|
||||
}
|
||||
|
||||
get queryPrefix() {
|
||||
return NativeEmbedder.supportedModels[this.model]?.queryPrefix || "";
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the available models in an API response format
|
||||
* we can use to populate the frontend dropdown.
|
||||
* @returns {{id: string, name: string, description: string, lang: string, size: string, modelCard: string}[]}
|
||||
*/
|
||||
static availableModels() {
|
||||
return Object.values(NativeEmbedder.supportedModels).map(
|
||||
(model) => model.apiInfo
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the embedding model to use.
|
||||
* We only support a few models and will default to the default model if the environment variable is not set or not supported.
|
||||
*
|
||||
* Why only a few? Because we need to mirror them on the CDN so non-US users can download them.
|
||||
* eg: "Xenova/all-MiniLM-L6-v2"
|
||||
* eg: "Xenova/nomic-embed-text-v1"
|
||||
* @returns {string}
|
||||
*/
|
||||
getEmbeddingModel() {
|
||||
const envModel =
|
||||
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
|
||||
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
|
||||
return NativeEmbedder.defaultModel;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the embedding model info.
|
||||
*
|
||||
* Will always fallback to the default model if the model is not supported.
|
||||
* @returns {Object}
|
||||
*/
|
||||
getEmbedderInfo() {
|
||||
const model = this.getEmbeddingModel();
|
||||
return NativeEmbedder.supportedModels[model];
|
||||
}
|
||||
|
||||
#tempfilePath() {
|
||||
const filename = `${v4()}.tmp`;
|
||||
const tmpPath = process.env.STORAGE_DIR
|
||||
@@ -124,7 +201,27 @@ class NativeEmbedder {
|
||||
throw fetchResponse.error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply the query prefix to the text input if it is required by the model.
|
||||
* eg: nomic-embed-text-v1 requires a query prefix for embedding/searching.
|
||||
* @param {string|string[]} textInput - The text to embed.
|
||||
* @returns {string|string[]} The text with the prefix applied.
|
||||
*/
|
||||
#applyQueryPrefix(textInput) {
|
||||
if (!this.queryPrefix) return textInput;
|
||||
if (Array.isArray(textInput))
|
||||
textInput = textInput.map((text) => `${this.queryPrefix}${text}`);
|
||||
else textInput = `${this.queryPrefix}${textInput}`;
|
||||
return textInput;
|
||||
}
|
||||
|
||||
/**
|
||||
* Embed a single text input.
|
||||
* @param {string|string[]} textInput - The text to embed.
|
||||
* @returns {Promise<Array<number>>} The embedded text.
|
||||
*/
|
||||
async embedTextInput(textInput) {
|
||||
textInput = this.#applyQueryPrefix(textInput);
|
||||
const result = await this.embedChunks(
|
||||
Array.isArray(textInput) ? textInput : [textInput]
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user