mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2026-06-15 23:20:32 +03:00
Native Embedder model selection (incl: Multilingual support) (#3835)
* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-text-embed-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null selection or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
@@ -20,22 +20,16 @@ function isNullOrNaN(value) {
|
||||
|
||||
class TextSplitter {
|
||||
#splitter;
|
||||
|
||||
/**
|
||||
* Creates a new TextSplitter instance.
|
||||
* @param {Object} config
|
||||
* @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
|
||||
* @param {number} [config.chunkSize = 1000] - The size of each chunk.
|
||||
* @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
|
||||
* @param {Object} [config.chunkHeaderMeta = null] - Metadata to be added to the start of each chunk - will come after the prefix.
|
||||
*/
|
||||
constructor(config = {}) {
|
||||
/*
|
||||
config can be a ton of things depending on what is required or optional by the specific splitter.
|
||||
Non-splitter related keys
|
||||
{
|
||||
splitByFilename: string, // TODO
|
||||
}
|
||||
------
|
||||
Default: "RecursiveCharacterTextSplitter"
|
||||
Config: {
|
||||
chunkSize: number,
|
||||
chunkOverlap: number,
|
||||
chunkHeaderMeta: object | null, // Gets appended to top of each chunk as metadata
|
||||
}
|
||||
------
|
||||
*/
|
||||
this.config = config;
|
||||
this.#splitter = this.#setSplitter(config);
|
||||
}
|
||||
@@ -124,20 +118,41 @@ class TextSplitter {
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a string of metadata to be prepended to each chunk.
|
||||
* Apply the chunk prefix to the text if it is present.
|
||||
* @param {string} text - The text to apply the prefix to.
|
||||
* @returns {string} The text with the embedder model prefix applied.
|
||||
*/
|
||||
#applyPrefix(text = "") {
|
||||
if (!this.config.chunkPrefix) return text;
|
||||
return `${this.config.chunkPrefix}${text}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a string of metadata to be prepended to each chunk.
|
||||
* Will additionally prepend a prefix to the text if it was provided (requirement for some embedders).
|
||||
* @returns {string} The text with the embedder model prefix applied.
|
||||
*/
|
||||
stringifyHeader() {
|
||||
if (!this.config.chunkHeaderMeta) return null;
|
||||
let content = "";
|
||||
if (!this.config.chunkHeaderMeta) return this.#applyPrefix(content);
|
||||
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
|
||||
if (!key || !value) return;
|
||||
content += `${key}: ${value}\n`;
|
||||
});
|
||||
|
||||
if (!content) return null;
|
||||
return `<document_metadata>\n${content}</document_metadata>\n\n`;
|
||||
if (!content) return this.#applyPrefix(content);
|
||||
return this.#applyPrefix(
|
||||
`<document_metadata>\n${content}</document_metadata>\n\n`
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Sets the splitter to use a defined config that is passed to other subclasses.
|
||||
* @param {Object} config
|
||||
* @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
|
||||
* @param {number} [config.chunkSize = 1000] - The size of each chunk.
|
||||
* @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
|
||||
*/
|
||||
#setSplitter(config = {}) {
|
||||
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
|
||||
return new RecursiveSplitter({
|
||||
@@ -160,7 +175,11 @@ class RecursiveSplitter {
|
||||
const {
|
||||
RecursiveCharacterTextSplitter,
|
||||
} = require("@langchain/textsplitters");
|
||||
this.log(`Will split with`, { chunkSize, chunkOverlap });
|
||||
this.log(`Will split with`, {
|
||||
chunkSize,
|
||||
chunkOverlap,
|
||||
chunkHeader: chunkHeader ? `${chunkHeader?.slice(0, 50)}...` : null,
|
||||
});
|
||||
this.chunkHeader = chunkHeader;
|
||||
this.engine = new RecursiveCharacterTextSplitter({
|
||||
chunkSize,
|
||||
|
||||
Reference in New Issue
Block a user