mirror of
https://github.com/Mintplex-Labs/anything-llm.git
synced 2026-06-13 22:04:11 +03:00
Native Embedder model selection (incl: Multilingual support) (#3835)
* WIP on embedder selection TODO: apply splitting and query prefixes (if applicable) * wip on upsert * Support base model support nomic-text-embed-v1 support multilingual-e5-small Add prefixing for both embedding and query for RAG tasks Add chunking prefix to all vector dbs to apply prefix when possible Show dropdown and auto-pull on new selection * norm translations * move supported models to constants handle null seelction or invalid selection on dropdown update comments * dev * patch text splitter maximums for now * normalize translations * add tests for splitter functionality * normalize --------- Co-authored-by: shatfield4 <seanhatfield5@gmail.com>
This commit is contained in:
104
server/__tests__/utils/TextSplitter/index.test.js
Normal file
104
server/__tests__/utils/TextSplitter/index.test.js
Normal file
@@ -0,0 +1,104 @@
|
||||
const { TextSplitter } = require("../../../utils/TextSplitter");
|
||||
const _ = require("lodash");
|
||||
|
||||
describe("TextSplitter", () => {
|
||||
test("should split long text into n sized chunks", async () => {
|
||||
const text = "This is a test text to be split into chunks".repeat(2);
|
||||
const textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
});
|
||||
const chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
});
|
||||
|
||||
test("applies chunk overlap of 20 characters on invalid chunkOverlap", async () => {
|
||||
const text = "This is a test text to be split into chunks".repeat(2);
|
||||
const textSplitter = new TextSplitter({
|
||||
chunkSize: 30,
|
||||
});
|
||||
const chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(6);
|
||||
});
|
||||
|
||||
test("does not allow chunkOverlap to be greater than chunkSize", async () => {
|
||||
expect(() => {
|
||||
new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 21,
|
||||
});
|
||||
}).toThrow();
|
||||
});
|
||||
|
||||
test("applies specific metadata to stringifyHeader to each chunk", async () => {
|
||||
const metadata = {
|
||||
id: "123e4567-e89b-12d3-a456-426614174000",
|
||||
url: "https://example.com",
|
||||
title: "Example",
|
||||
docAuthor: "John Doe",
|
||||
published: "2021-01-01",
|
||||
chunkSource: "link://https://example.com",
|
||||
description: "This is a test text to be split into chunks",
|
||||
};
|
||||
const chunkHeaderMeta = TextSplitter.buildHeaderMeta(metadata);
|
||||
expect(chunkHeaderMeta).toEqual({
|
||||
sourceDocument: metadata.title,
|
||||
source: metadata.url,
|
||||
published: metadata.published,
|
||||
});
|
||||
});
|
||||
|
||||
test("applies a valid chunkPrefix to each chunk", async () => {
|
||||
const text = "This is a test text to be split into chunks".repeat(2);
|
||||
let textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
chunkPrefix: "testing: ",
|
||||
});
|
||||
let chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
expect(chunks.every(chunk => chunk.startsWith("testing: "))).toBe(true);
|
||||
|
||||
textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
chunkPrefix: "testing2: ",
|
||||
});
|
||||
chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
expect(chunks.every(chunk => chunk.startsWith("testing2: "))).toBe(true);
|
||||
|
||||
textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
chunkPrefix: undefined,
|
||||
});
|
||||
chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
|
||||
|
||||
textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
chunkPrefix: "",
|
||||
});
|
||||
chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
|
||||
|
||||
// Applied chunkPrefix with chunkHeaderMeta
|
||||
textSplitter = new TextSplitter({
|
||||
chunkSize: 20,
|
||||
chunkOverlap: 0,
|
||||
chunkHeaderMeta: TextSplitter.buildHeaderMeta({
|
||||
title: "Example",
|
||||
url: "https://example.com",
|
||||
published: "2021-01-01",
|
||||
}),
|
||||
chunkPrefix: "testing3: ",
|
||||
});
|
||||
chunks = await textSplitter.splitText(text);
|
||||
expect(chunks.length).toEqual(5);
|
||||
expect(chunks.every(chunk => chunk.startsWith("testing3: <document_metadata>"))).toBe(true);
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user