Native Embedder model selection (incl: Multilingual support) (#3835)

* WIP on embedder selection
TODO: apply splitting and query prefixes (if applicable)

* wip on upsert

* Support base model
support nomic-embed-text-v1
support multilingual-e5-small
Add prefixing for both embedding and query for RAG tasks
Add chunking prefix to all vector dbs to apply prefix when possible
Show dropdown and auto-pull on new selection

* norm translations

* move supported models to constants
handle null selection or invalid selection on dropdown
update comments

* dev

* patch text splitter maximums for now

* normalize translations

* add tests for splitter functionality

* normalize

---------

Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
Timothy Carambat
2025-07-22 10:07:20 -07:00
committed by GitHub
parent 31a8ead823
commit 2c19dd09ed
44 changed files with 463 additions and 80 deletions

View File

@@ -0,0 +1,63 @@
/**
 * Registry of the embedding models the native embedder can run, keyed by model id.
 *
 * Per-model settings:
 *  - maxConcurrentChunks: how many strings are processed in a single pass.
 *  - embeddingMaxChunkLength: a CHARACTER budget per chunk, not a token count.
 *    Each value is roughly the model card's token limit at ~2 characters per
 *    token, rounded down so we undershoot.
 *  - chunkPrefix / queryPrefix: text prepended to stored chunks / search queries
 *    ("" when the model uses no prefix).
 *  - apiInfo: display metadata for the model.
 */
const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
  "Xenova/all-MiniLM-L6-v2": {
    maxConcurrentChunks: 25,
    // Model card limit: 512 tokens -> capped at 1,000 characters.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "",
    queryPrefix: "",
    apiInfo: {
      id: "Xenova/all-MiniLM-L6-v2",
      name: "all-MiniLM-L6-v2",
      description:
        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
      lang: "English",
      size: "23MB",
      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
    },
  },

  "Xenova/nomic-embed-text-v1": {
    maxConcurrentChunks: 5,
    // Model card limit: 8192 tokens -> capped at 16,000 characters.
    embeddingMaxChunkLength: 16_000,
    chunkPrefix: "search_document: ",
    queryPrefix: "search_query: ",
    apiInfo: {
      id: "Xenova/nomic-embed-text-v1",
      name: "nomic-embed-text-v1",
      description:
        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
      lang: "English",
      size: "139MB",
      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
    },
  },

  "MintplexLabs/multilingual-e5-small": {
    maxConcurrentChunks: 5,
    // Model card limit: 512 tokens -> capped at 1,000 characters.
    embeddingMaxChunkLength: 1_000,
    chunkPrefix: "passage: ",
    queryPrefix: "query: ",
    apiInfo: {
      id: "MintplexLabs/multilingual-e5-small",
      name: "multilingual-e5-small",
      description:
        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
      lang: "100+ languages",
      size: "487MB",
      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
    },
  },
};
// CommonJS export of the supported-model registry defined above.
module.exports = {
SUPPORTED_NATIVE_EMBEDDING_MODELS,
};

View File

@@ -2,37 +2,114 @@ const path = require("path");
const fs = require("fs");
const { toChunks } = require("../../helpers");
const { v4 } = require("uuid");
const { SUPPORTED_NATIVE_EMBEDDING_MODELS } = require("./constants");
class NativeEmbedder {
// Model id used when EMBEDDING_MODEL_PREF is unset or does not name a supported model.
static defaultModel = "Xenova/all-MiniLM-L6-v2";
/**
 * Supported embedding models for native, keyed by model id.
 * @type {Record<string, {
 *  maxConcurrentChunks: number;
 *  embeddingMaxChunkLength: number;
 *  chunkPrefix: string;
 *  queryPrefix: string;
 *  apiInfo: {
 *    id: string;
 *    name: string;
 *    description: string;
 *    lang: string;
 *    size: string;
 *    modelCard: string;
 *  };
 * }>}
 */
static supportedModels = SUPPORTED_NATIVE_EMBEDDING_MODELS;
// This is a folder that Mintplex Labs hosts for those who cannot capture the HF model download
// endpoint for various reasons. This endpoint is not guaranteed to be active or maintained
// and may go offline at any time at Mintplex Labs's discretion.
#fallbackHost = "https://cdn.anythingllm.com/support/models/";
constructor() {
// Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
this.model = "Xenova/all-MiniLM-L6-v2";
this.model = this.getEmbeddingModel();
this.modelInfo = this.getEmbedderInfo();
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`)
: path.resolve(__dirname, `../../../storage/models`)
);
this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
this.modelDownloaded = fs.existsSync(this.modelPath);
// Limit of how many strings we can process in a single pass to stay with resource or network limits
this.maxConcurrentChunks = 25;
this.embeddingMaxChunkLength = 1_000;
this.maxConcurrentChunks = this.modelInfo.maxConcurrentChunks;
this.embeddingMaxChunkLength = this.modelInfo.embeddingMaxChunkLength;
// Make directory when it does not exist in existing installations
if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
this.log("Initialized");
this.log(`Initialized ${this.model}`);
}
log(text, ...args) {
console.log(`\x1b[36m[NativeEmbedder]\x1b[0m ${text}`, ...args);
}
/**
* Get the selected model from the environment variable.
* @returns {string}
*/
static _getEmbeddingModel() {
const envModel =
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
return NativeEmbedder.defaultModel;
}
get embeddingPrefix() {
return NativeEmbedder.supportedModels[this.model]?.chunkPrefix || "";
}
get queryPrefix() {
return NativeEmbedder.supportedModels[this.model]?.queryPrefix || "";
}
/**
* Get the available models in an API response format
* we can use to populate the frontend dropdown.
* @returns {{id: string, name: string, description: string, lang: string, size: string, modelCard: string}[]}
*/
static availableModels() {
return Object.values(NativeEmbedder.supportedModels).map(
(model) => model.apiInfo
);
}
/**
* Get the embedding model to use.
* We only support a few models and will default to the default model if the environment variable is not set or not supported.
*
* Why only a few? Because we need to mirror them on the CDN so non-US users can download them.
* eg: "Xenova/all-MiniLM-L6-v2"
* eg: "Xenova/nomic-embed-text-v1"
* @returns {string}
*/
getEmbeddingModel() {
const envModel =
process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
return NativeEmbedder.defaultModel;
}
/**
* Get the embedding model info.
*
* Will always fallback to the default model if the model is not supported.
* @returns {Object}
*/
getEmbedderInfo() {
const model = this.getEmbeddingModel();
return NativeEmbedder.supportedModels[model];
}
#tempfilePath() {
const filename = `${v4()}.tmp`;
const tmpPath = process.env.STORAGE_DIR
@@ -124,7 +201,27 @@ class NativeEmbedder {
throw fetchResponse.error;
}
/**
* Apply the query prefix to the text input if it is required by the model.
* eg: nomic-embed-text-v1 requires a query prefix for embedding/searching.
* @param {string|string[]} textInput - The text to embed.
* @returns {string|string[]} The text with the prefix applied.
*/
#applyQueryPrefix(textInput) {
if (!this.queryPrefix) return textInput;
if (Array.isArray(textInput))
textInput = textInput.map((text) => `${this.queryPrefix}${text}`);
else textInput = `${this.queryPrefix}${textInput}`;
return textInput;
}
/**
* Embed a single text input.
* @param {string|string[]} textInput - The text to embed.
* @returns {Promise<Array<number>>} The embedded text.
*/
async embedTextInput(textInput) {
textInput = this.#applyQueryPrefix(textInput);
const result = await this.embedChunks(
Array.isArray(textInput) ? textInput : [textInput]
);