Native Embedder model selection (incl: Multilingual support) (#3835)

* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-embed-text-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null selection or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
2026-06-15 23:20:32 +03:00 · 2025-07-22 10:07:20 -07:00
parent 31a8ead823
commit 2c19dd09ed
44 changed files with 463 additions and 80 deletions
--- a/server/utils/EmbeddingEngines/native/constants.js
+++ b/server/utils/EmbeddingEngines/native/constants.js
@@ -0,0 +1,63 @@
+const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
+  "Xenova/all-MiniLM-L6-v2": {
+    maxConcurrentChunks: 25,
+    // Right now, this is NOT the token length, and is instead the number of characters
+    // that can be processed in a single pass. So we override to 1,000 characters.
+    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
+    // embeddingMaxChunkLength: 512, (from the model card)
+    embeddingMaxChunkLength: 1_000,
+    chunkPrefix: "",
+    queryPrefix: "",
+    apiInfo: {
+      id: "Xenova/all-MiniLM-L6-v2",
+      name: "all-MiniLM-L6-v2",
+      description:
+        "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
+      lang: "English",
+      size: "23MB",
+      modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
+    },
+  },
+  "Xenova/nomic-embed-text-v1": {
+    maxConcurrentChunks: 5,
+    // Right now, this is NOT the token length, and is instead the number of characters
+    // that can be processed in a single pass. So we override to 16,000 characters.
+    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
+    // embeddingMaxChunkLength: 8192, (from the model card)
+    embeddingMaxChunkLength: 16_000,
+    chunkPrefix: "search_document: ",
+    queryPrefix: "search_query: ",
+    apiInfo: {
+      id: "Xenova/nomic-embed-text-v1",
+      name: "nomic-embed-text-v1",
+      description:
+        "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
+      lang: "English",
+      size: "139MB",
+      modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
+    },
+  },
+  "MintplexLabs/multilingual-e5-small": {
+    maxConcurrentChunks: 5,
+    // Right now, this is NOT the token length, and is instead the number of characters
+    // that can be processed in a single pass. So we override to 1,000 characters.
+    // roughly the max number of tokens assuming 2 characters per token. (undershooting)
+    // embeddingMaxChunkLength: 512, (from the model card)
+    embeddingMaxChunkLength: 1_000,
+    chunkPrefix: "passage: ",
+    queryPrefix: "query: ",
+    apiInfo: {
+      id: "MintplexLabs/multilingual-e5-small",
+      name: "multilingual-e5-small",
+      description:
+        "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
+      lang: "100+ languages",
+      size: "487MB",
+      modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
+    },
+  },
+};
+
+module.exports = {
+  SUPPORTED_NATIVE_EMBEDDING_MODELS,
+};
--- a/server/utils/EmbeddingEngines/native/index.js
+++ b/server/utils/EmbeddingEngines/native/index.js
@@ -2,37 +2,114 @@ const path = require("path");
 const fs = require("fs");
 const { toChunks } = require("../../helpers");
 const { v4 } = require("uuid");
+const { SUPPORTED_NATIVE_EMBEDDING_MODELS } = require("./constants");

 class NativeEmbedder {
+  static defaultModel = "Xenova/all-MiniLM-L6-v2";
+
+  /**
+   * Supported embedding models for native.
+   * @type {Record<string, {
+   *   chunkPrefix: string;
+   *   queryPrefix: string;
+   *   apiInfo: {
+   *     id: string;
+   *     name: string;
+   *     description: string;
+   *     lang: string;
+   *     size: string;
+   *     modelCard: string;
+   *   };
+   * }>}
+   */
+  static supportedModels = SUPPORTED_NATIVE_EMBEDDING_MODELS;
+
  // This is a folder that Mintplex Labs hosts for those who cannot capture the HF model download
  // endpoint for various reasons. This endpoint is not guaranteed to be active or maintained
  // and may go offline at any time at Mintplex Labs's discretion.
  #fallbackHost = "https://cdn.anythingllm.com/support/models/";

  constructor() {
-    // Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
-    this.model = "Xenova/all-MiniLM-L6-v2";
+    this.model = this.getEmbeddingModel();
+    this.modelInfo = this.getEmbedderInfo();
    this.cacheDir = path.resolve(
      process.env.STORAGE_DIR
        ? path.resolve(process.env.STORAGE_DIR, `models`)
        : path.resolve(__dirname, `../../../storage/models`)
    );
-    this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
+    this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
    this.modelDownloaded = fs.existsSync(this.modelPath);

    // Limit of how many strings we can process in a single pass to stay with resource or network limits
-    this.maxConcurrentChunks = 25;
-    this.embeddingMaxChunkLength = 1_000;
+    this.maxConcurrentChunks = this.modelInfo.maxConcurrentChunks;
+    this.embeddingMaxChunkLength = this.modelInfo.embeddingMaxChunkLength;

    // Make directory when it does not exist in existing installations
    if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
-    this.log("Initialized");
+    this.log(`Initialized ${this.model}`);
  }

  log(text, ...args) {
    console.log(`\x1b[36m[NativeEmbedder]\x1b[0m ${text}`, ...args);
  }

+  /**
+   * Get the selected model from the environment variable.
+   * @returns {string}
+   */
+  static _getEmbeddingModel() {
+    const envModel =
+      process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
+    if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
+    return NativeEmbedder.defaultModel;
+  }
+
+  get embeddingPrefix() {
+    return NativeEmbedder.supportedModels[this.model]?.chunkPrefix || "";
+  }
+
+  get queryPrefix() {
+    return NativeEmbedder.supportedModels[this.model]?.queryPrefix || "";
+  }
+
+  /**
+   * Get the available models in an API response format
+   * we can use to populate the frontend dropdown.
+   * @returns {{id: string, name: string, description: string, lang: string, size: string, modelCard: string}[]}
+   */
+  static availableModels() {
+    return Object.values(NativeEmbedder.supportedModels).map(
+      (model) => model.apiInfo
+    );
+  }
+
+  /**
+   * Get the embedding model to use.
+   * We only support a few models and will default to the default model if the environment variable is not set or not supported.
+   *
+   * Why only a few? Because we need to mirror them on the CDN so non-US users can download them.
+   * eg: "Xenova/all-MiniLM-L6-v2"
+   * eg: "Xenova/nomic-embed-text-v1"
+   * @returns {string}
+   */
+  getEmbeddingModel() {
+    const envModel =
+      process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
+    if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
+    return NativeEmbedder.defaultModel;
+  }
+
+  /**
+   * Get the embedding model info.
+   *
+   * Will always fallback to the default model if the model is not supported.
+   * @returns {Object}
+   */
+  getEmbedderInfo() {
+    const model = this.getEmbeddingModel();
+    return NativeEmbedder.supportedModels[model];
+  }
+
  #tempfilePath() {
    const filename = `${v4()}.tmp`;
    const tmpPath = process.env.STORAGE_DIR
@@ -124,7 +201,27 @@ class NativeEmbedder {
    throw fetchResponse.error;
  }

+  /**
+   * Apply the query prefix to the text input if it is required by the model.
+   * eg: nomic-embed-text-v1 requires a query prefix for embedding/searching.
+   * @param {string|string[]} textInput - The text to embed.
+   * @returns {string|string[]} The text with the prefix applied.
+   */
+  #applyQueryPrefix(textInput) {
+    if (!this.queryPrefix) return textInput;
+    if (Array.isArray(textInput))
+      textInput = textInput.map((text) => `${this.queryPrefix}${text}`);
+    else textInput = `${this.queryPrefix}${textInput}`;
+    return textInput;
+  }
+
+  /**
+   * Embed a single text input.
+   * @param {string|string[]} textInput - The text to embed.
+   * @returns {Promise<Array<number>>} The embedded text.
+   */
  async embedTextInput(textInput) {
+    textInput = this.#applyQueryPrefix(textInput);
    const result = await this.embedChunks(
      Array.isArray(textInput) ? textInput : [textInput]
    );