Native Embedder model selection (incl: Multilingual support) (#3835)

* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-text-embed-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null selection or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
2026-06-15 23:20:32 +03:00 · 2025-07-22 10:07:20 -07:00
parent 31a8ead823
commit 2c19dd09ed
44 changed files with 463 additions and 80 deletions
--- a/server/utils/TextSplitter/index.js
+++ b/server/utils/TextSplitter/index.js
@@ -20,22 +20,16 @@ function isNullOrNaN(value) {

 class TextSplitter {
  #splitter;
+
+  /**
+   * Creates a new TextSplitter instance.
+   * @param {Object} config
+   * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
+   * @param {number} [config.chunkSize = 1000] - The size of each chunk.
+   * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
+   * @param {Object} [config.chunkHeaderMeta = null] - Metadata to be added to the start of each chunk - will come after the prefix.
+   */
  constructor(config = {}) {
-    /*
-      config can be a ton of things depending on what is required or optional by the specific splitter.
-      Non-splitter related keys
-      {
-        splitByFilename: string, // TODO
-      }
-      ------
-      Default: "RecursiveCharacterTextSplitter"
-      Config: {
-        chunkSize: number,
-        chunkOverlap: number,
-        chunkHeaderMeta: object | null, // Gets appended to top of each chunk as metadata
-      }
-      ------
-    */
    this.config = config;
    this.#splitter = this.#setSplitter(config);
  }
@@ -124,20 +118,41 @@ class TextSplitter {
  }

  /**
-   *  Creates a string of metadata to be prepended to each chunk.
+   * Apply the chunk prefix to the text if it is present.
+   * @param {string} text - The text to apply the prefix to.
+   * @returns {string} The text with the embedder model prefix applied.
+   */
+  #applyPrefix(text = "") {
+    if (!this.config.chunkPrefix) return text;
+    return `${this.config.chunkPrefix}${text}`;
+  }
+
+  /**
+   * Creates a string of metadata to be prepended to each chunk.
+   * Will additionally prepend a prefix to the text if it was provided (requirement for some embedders).
+   * @returns {string} The metadata header with the chunk prefix (if any) prepended; an empty string when neither is set.
   */
  stringifyHeader() {
-    if (!this.config.chunkHeaderMeta) return null;
    let content = "";
+    if (!this.config.chunkHeaderMeta) return this.#applyPrefix(content);
    Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
      if (!key || !value) return;
      content += `${key}: ${value}\n`;
    });

-    if (!content) return null;
-    return `<document_metadata>\n${content}</document_metadata>\n\n`;
+    if (!content) return this.#applyPrefix(content);
+    return this.#applyPrefix(
+      `<document_metadata>\n${content}</document_metadata>\n\n`
+    );
  }

+  /**
+   * Creates the splitter to use, passing the given config on to the underlying splitter class.
+   * @param {Object} config
+   * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
+   * @param {number} [config.chunkSize = 1000] - The size of each chunk.
+   * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
+   */
  #setSplitter(config = {}) {
    // if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
    return new RecursiveSplitter({
@@ -160,7 +175,11 @@ class RecursiveSplitter {
    const {
      RecursiveCharacterTextSplitter,
    } = require("@langchain/textsplitters");
-    this.log(`Will split with`, { chunkSize, chunkOverlap });
+    this.log(`Will split with`, {
+      chunkSize,
+      chunkOverlap,
+      chunkHeader: chunkHeader ? `${chunkHeader?.slice(0, 50)}...` : null,
+    });
    this.chunkHeader = chunkHeader;
    this.engine = new RecursiveCharacterTextSplitter({
      chunkSize,