diff --git a/.github/workflows/dev-build.yaml b/.github/workflows/dev-build.yaml
index e699a630f..023525b26 100644
--- a/.github/workflows/dev-build.yaml
+++ b/.github/workflows/dev-build.yaml
@@ -6,7 +6,7 @@ concurrency:
on:
push:
- branches: ['4034-version-control'] # put your current branch to create a build. Core team only.
+ branches: ['multilingual-native-embedder-selection'] # put your current branch to create a build. Core team only.
paths-ignore:
- '**.md'
- 'cloud-deployments/*'
diff --git a/docker/.env.example b/docker/.env.example
index 6a5d8b331..7244bdff1 100644
--- a/docker/.env.example
+++ b/docker/.env.example
@@ -140,6 +140,10 @@ GID='1000'
###########################################
######## Embedding API SElECTION ##########
###########################################
+# This will be the assumed default embedding selection and model
+# EMBEDDING_ENGINE='native'
+# EMBEDDING_MODEL_PREF='Xenova/all-MiniLM-L6-v2'
+
# Only used if you are using an LLM that does not natively support embedding (openai or Azure)
# EMBEDDING_ENGINE='openai'
# OPEN_AI_KEY=sk-xxxx
diff --git a/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx b/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx
index e3f974b8b..52babce15 100644
--- a/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx
+++ b/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx
@@ -1,12 +1,100 @@
-import { useTranslation } from "react-i18next";
+import { useEffect, useState } from "react";
+import { Link } from "react-router-dom";
+import System from "@/models/system";
+
+export default function NativeEmbeddingOptions({ settings }) {
+ const [loading, setLoading] = useState(true);
+ const [availableModels, setAvailableModels] = useState([]);
+ const [selectedModel, setSelectedModel] = useState(
+ settings?.EmbeddingModelPref
+ );
+ const [selectedModelInfo, setSelectedModelInfo] = useState();
+
+ useEffect(() => {
+ System.customModels("native-embedder")
+ .then(({ models }) => {
+ if (models?.length > 0) {
+ setAvailableModels(models);
+ const _selectedModel =
+ models.find((model) => model.id === settings?.EmbeddingModelPref) ??
+ models[0];
+ setSelectedModel(_selectedModel.id);
+ setSelectedModelInfo(_selectedModel);
+ }
+ })
+ .finally(() => {
+ setLoading(false);
+ });
+ }, []);
+
+ useEffect(() => {
+ if (!availableModels?.length || !selectedModel) return;
+ setSelectedModelInfo(
+ availableModels.find((model) => model.id === selectedModel)
+ );
+ }, [selectedModel, availableModels]);
-export default function NativeEmbeddingOptions() {
- const { t } = useTranslation();
return (
-
-
- {t("embedding.provider.description")}
-
+
+
+
+
+
+
+ {selectedModelInfo && (
+
+
+ {selectedModelInfo?.description}
+
+
+ Trained on: {selectedModelInfo?.lang}
+
+
+ Download Size: {selectedModelInfo?.size}
+
+
+ View model card on Hugging Face →
+
+
+ )}
+
);
}
diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
index 15b13a003..094aa7f53 100644
--- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
+++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx
@@ -122,7 +122,7 @@ const Citation = memo(({ source, onClick, textSizeClass }) => {
});
function omitChunkHeader(text) {
- if (!text.startsWith("")) return text;
+ if (!text.includes("")) return text;
return text.split("")[1].trim();
}
diff --git a/frontend/src/locales/ar/common.js b/frontend/src/locales/ar/common.js
index 791eea10b..6df193b50 100644
--- a/frontend/src/locales/ar/common.js
+++ b/frontend/src/locales/ar/common.js
@@ -406,8 +406,6 @@ const TRANSLATIONS = {
"التضمين هو عملية تحويل النص إلى متجهات. هذه البيانات مطلوبة لتحويل ملفاتك ومطالباتك إلى تنسيق يمكن لـ إني ثينك إلْلْمْ استخدامه للمعالجة.",
provider: {
title: "موفر التضمين",
- description:
- "لا يلزم إجراء أي إعداد عند استخدام محرك التضمين الأصلي الخاص بـ إني ثينك إلْلْمْ.",
},
},
text: {
diff --git a/frontend/src/locales/da/common.js b/frontend/src/locales/da/common.js
index 89d4ce549..4ee233adb 100644
--- a/frontend/src/locales/da/common.js
+++ b/frontend/src/locales/da/common.js
@@ -408,8 +408,6 @@ const TRANSLATIONS = {
"Indlejring er processen med at omdanne tekst til vektorer. Disse legitimationsoplysninger er nødvendige for at omdanne dine filer og prompts til et format, som AnythingLLM kan bruge til behandling.",
provider: {
title: "Indlejringsudbyder",
- description:
- "Ingen opsætning er nødvendig, når du bruger AnythingLLM's indbyggede indlejringsmotor.",
},
},
text: {
diff --git a/frontend/src/locales/de/common.js b/frontend/src/locales/de/common.js
index b4ece82bd..cd29dad0d 100644
--- a/frontend/src/locales/de/common.js
+++ b/frontend/src/locales/de/common.js
@@ -597,8 +597,6 @@ const TRANSLATIONS = {
"Einbettung ist der Prozess, Text in Vektoren umzuwandeln. Diese Anmeldeinformationen sind erforderlich, um Ihre Dateien und Prompts in ein Format umzuwandeln, das AnythingLLM zur Verarbeitung verwenden kann.",
provider: {
title: "Einbettungsanbieter",
- description:
- "Bei Verwendung der nativen Einbettungs-Engine von AnythingLLM ist keine Einrichtung erforderlich.",
},
},
text: {
diff --git a/frontend/src/locales/en/common.js b/frontend/src/locales/en/common.js
index 6dad27ed1..e70fa29ee 100644
--- a/frontend/src/locales/en/common.js
+++ b/frontend/src/locales/en/common.js
@@ -620,8 +620,6 @@ const TRANSLATIONS = {
"Embedding is the process of turning text into vectors. These credentials are required to turn your files and prompts into a format which AnythingLLM can use to process.",
provider: {
title: "Embedding Provider",
- description:
- "There is no set up required when using AnythingLLM's native embedding engine.",
},
},
diff --git a/frontend/src/locales/es/common.js b/frontend/src/locales/es/common.js
index 5deba2830..e61676af9 100644
--- a/frontend/src/locales/es/common.js
+++ b/frontend/src/locales/es/common.js
@@ -405,8 +405,6 @@ const TRANSLATIONS = {
"La incrustación es el proceso de convertir texto en vectores. Estas credenciales son necesarias para convertir tus archivos y prompts en un formato que AnythingLLM pueda usar para procesar.",
provider: {
title: "Proveedor de incrustación",
- description:
- "No se requiere configuración cuando se utiliza el motor de incrustación nativo de AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/et/common.js b/frontend/src/locales/et/common.js
index b72017854..481be22db 100644
--- a/frontend/src/locales/et/common.js
+++ b/frontend/src/locales/et/common.js
@@ -573,8 +573,6 @@ const TRANSLATIONS = {
"Embedding muudab teksti vektoriteks. Need võtmed on vajalikud, et AnythingLLM saaks sinu failid ja päringud töödelda.",
provider: {
title: "Embedding-i pakkuja",
- description:
- "AnythingLLM-i sisseehitatud embedding-mootor ei vaja seadistust.",
},
},
text: {
diff --git a/frontend/src/locales/fa/common.js b/frontend/src/locales/fa/common.js
index aa76baf72..beae7d039 100644
--- a/frontend/src/locales/fa/common.js
+++ b/frontend/src/locales/fa/common.js
@@ -398,8 +398,6 @@ const TRANSLATIONS = {
"جاسازی فرآیند تبدیل متن به بردارها است. این اعتبارنامهها برای تبدیل فایلها و درخواستهای شما به فرمتی که AnythingLLM بتواند پردازش کند، ضروری هستند.",
provider: {
title: "ارائهدهنده جاسازی",
- description:
- "هنگام استفاده از موتور جاسازی داخلی AnythingLLM نیازی به تنظیمات نیست.",
},
},
text: {
diff --git a/frontend/src/locales/fr/common.js b/frontend/src/locales/fr/common.js
index a50dae802..06f702a0a 100644
--- a/frontend/src/locales/fr/common.js
+++ b/frontend/src/locales/fr/common.js
@@ -406,8 +406,6 @@ const TRANSLATIONS = {
"L'intégration est le processus de transformation du texte en vecteurs. Ces identifiants sont nécessaires pour transformer vos fichiers et invites en un format que AnythingLLM peut utiliser pour traiter.",
provider: {
title: "Fournisseur d'intégration",
- description:
- "Aucune configuration n'est nécessaire lors de l'utilisation du moteur d'intégration natif de AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/he/common.js b/frontend/src/locales/he/common.js
index 15b7523bd..5fabc954d 100644
--- a/frontend/src/locales/he/common.js
+++ b/frontend/src/locales/he/common.js
@@ -394,8 +394,6 @@ const TRANSLATIONS = {
"הטבעה היא תהליך הפיכת טקסט לווקטורים. אישורי הרשאה אלה נדרשים כדי להפוך את הקבצים והבקשות שלך לפורמט ש-AnythingLLM יכול להשתמש בו לעיבוד.",
provider: {
title: "ספק הטבעה",
- description:
- "אין צורך בהגדרה בעת שימוש במנוע ההטבעה המקורי של AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/it/common.js b/frontend/src/locales/it/common.js
index d0f9c574b..12689ac54 100644
--- a/frontend/src/locales/it/common.js
+++ b/frontend/src/locales/it/common.js
@@ -404,8 +404,6 @@ const TRANSLATIONS = {
"L'embedding è il processo di trasformazione del testo in vettori. Queste credenziali sono necessarie per trasformare i file e i prompt in un formato che AnythingLLM può utilizzare per l'elaborazione.",
provider: {
title: "Provider di embedding",
- description:
- "Non è richiesta alcuna configurazione quando si utilizza il motore di embedding nativo di AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/ja/common.js b/frontend/src/locales/ja/common.js
index 0213e4eb2..d3fa754b5 100644
--- a/frontend/src/locales/ja/common.js
+++ b/frontend/src/locales/ja/common.js
@@ -406,8 +406,6 @@ const TRANSLATIONS = {
"埋め込みとは、テキストをベクトルに変換するプロセスです。これらの認証情報は、ファイルやプロンプトをAnythingLLMが処理できるフォーマットに変換するために必要です。",
provider: {
title: "埋め込みプロバイダー",
- description:
- "AnythingLLMのネイティブ埋め込みエンジンを使用する場合、特に設定は必要ありません。",
},
},
text: {
diff --git a/frontend/src/locales/ko/common.js b/frontend/src/locales/ko/common.js
index 89bf00498..c9afa6584 100644
--- a/frontend/src/locales/ko/common.js
+++ b/frontend/src/locales/ko/common.js
@@ -581,8 +581,6 @@ const TRANSLATIONS = {
"임베딩은 텍스트를 벡터로 변환하는 과정입니다. 파일과 프롬프트를 AnythingLLM이 처리할 수 있는 형식으로 변환하려면 이러한 인증이 필요합니다.",
provider: {
title: "임베딩 제공자",
- description:
- "AnythingLLM의 기본 임베딩 엔진을 사용할 때는 설정이 필요하지 않습니다.",
},
},
text: {
diff --git a/frontend/src/locales/lv/common.js b/frontend/src/locales/lv/common.js
index 461133432..86d13f2cb 100644
--- a/frontend/src/locales/lv/common.js
+++ b/frontend/src/locales/lv/common.js
@@ -592,8 +592,6 @@ const TRANSLATIONS = {
"Iegulšana ir process, ar kuru teksts tiek pārveidots vektoros. Šie akreditācijas dati ir nepieciešami, lai pārveidotu jūsu failus un vaicājumus formātā, kuru AnythingLLM var izmantot apstrādei.",
provider: {
title: "Iegulšanas pakalpojuma sniedzējs",
- description:
- "Nav nepieciešama iestatīšana, izmantojot AnythingLLM iebūvēto iegulšanas dzinēju.",
},
},
text: {
diff --git a/frontend/src/locales/nl/common.js b/frontend/src/locales/nl/common.js
index 49581f280..46b6a2f57 100644
--- a/frontend/src/locales/nl/common.js
+++ b/frontend/src/locales/nl/common.js
@@ -401,8 +401,6 @@ const TRANSLATIONS = {
"Inbedding is het proces van het omzetten van tekst in vectoren. Deze inloggegevens zijn vereist om je bestanden en prompts om te zetten naar een formaat dat AnythingLLM kan gebruiken om te verwerken.",
provider: {
title: "Inbedding Provider",
- description:
- "Er is geen instelling vereist bij gebruik van de ingebouwde inbeddingengine van AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/normalizeEn.mjs b/frontend/src/locales/normalizeEn.mjs
index a2d7ad1e5..e2eb42f3c 100644
--- a/frontend/src/locales/normalizeEn.mjs
+++ b/frontend/src/locales/normalizeEn.mjs
@@ -94,6 +94,11 @@ function normalizeTranslations(lang, source, target, subdir = null) {
);
}
+ // If a non-en file has a key that is NOT in the en file, it will be removed
+ for (const key of Object.keys(normalized)) {
+ if (!source[key]) delete normalized[key];
+ }
+
return normalized;
}
diff --git a/frontend/src/locales/pl/common.js b/frontend/src/locales/pl/common.js
index f5c029cca..a2131303f 100644
--- a/frontend/src/locales/pl/common.js
+++ b/frontend/src/locales/pl/common.js
@@ -597,8 +597,6 @@ const TRANSLATIONS = {
"Embedding to proces przekształcania tekstu na wektory. Poświadczenia są wymagane do przekształcenia plików i tekstu za pomocą wybranego modelu.",
provider: {
title: "Model używany do tworzenia embeddingów",
- description:
- "Podczas korzystania z natywnego silnika osadzania AnythingLLM nie jest wymagana żadna konfiguracja.",
},
},
text: {
diff --git a/frontend/src/locales/pt_BR/common.js b/frontend/src/locales/pt_BR/common.js
index 44601872a..53197c098 100644
--- a/frontend/src/locales/pt_BR/common.js
+++ b/frontend/src/locales/pt_BR/common.js
@@ -579,8 +579,6 @@ const TRANSLATIONS = {
"Vínculo é o processo de transformar texto em vetores. Essas credenciais são necessárias para processar arquivos e prompts.",
provider: {
title: "Provedor de Vínculo",
- description:
- "Nenhuma configuração é necessária ao usar o mecanismo nativo do AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/ru/common.js b/frontend/src/locales/ru/common.js
index 239f49c1b..6cf54bdac 100644
--- a/frontend/src/locales/ru/common.js
+++ b/frontend/src/locales/ru/common.js
@@ -410,8 +410,6 @@ const TRANSLATIONS = {
"Встраивание - это процесс превращения текста в векторы. Эти учетные данные необходимы для превращения ваших файлов и подсказок в формат, который AnythingLLM может использовать для обработки.",
provider: {
title: "Поставщик встраивания",
- description:
- "Нет необходимости в настройке при использовании встроенного механизма встраивания AnythingLLM.",
},
},
text: {
diff --git a/frontend/src/locales/tr/common.js b/frontend/src/locales/tr/common.js
index 0dd48d09c..58493b4eb 100644
--- a/frontend/src/locales/tr/common.js
+++ b/frontend/src/locales/tr/common.js
@@ -401,8 +401,6 @@ const TRANSLATIONS = {
"Gömme, metni vektörlere dönüştürme sürecidir. Dosyalarınızın ve komutlarınızın işlenebilmesi için AnythingLLM, bu kimlik bilgilerine ihtiyaç duyar.",
provider: {
title: "Embedding Sağlayıcısı",
- description:
- "AnythingLLM'nin yerel gömme motoru kullanıldığında ek bir kurulum gerekmez.",
},
},
text: {
diff --git a/frontend/src/locales/vn/common.js b/frontend/src/locales/vn/common.js
index fc8ea8bd9..c03a9161a 100644
--- a/frontend/src/locales/vn/common.js
+++ b/frontend/src/locales/vn/common.js
@@ -400,8 +400,6 @@ const TRANSLATIONS = {
"Embedding is the process of turning text into vectors. These credentials are required to turn your files and prompts into a format which AnythingLLM can use to process.",
provider: {
title: "Embedding Provider",
- description:
- "There is no set up required when using AnythingLLM's native embedding engine.",
},
},
text: {
diff --git a/frontend/src/locales/zh/common.js b/frontend/src/locales/zh/common.js
index 82e15293f..2a27f7736 100644
--- a/frontend/src/locales/zh/common.js
+++ b/frontend/src/locales/zh/common.js
@@ -557,7 +557,6 @@ const TRANSLATIONS = {
"嵌入是将文本转换为矢量的过程。需要这些凭据才能将你的文件和提示转换为 AnythingLLM 可以用来处理的格式。",
provider: {
title: "嵌入引擎提供商",
- description: "使用 AnythingLLM 的本机嵌入引擎时不需要设置。",
},
},
text: {
diff --git a/frontend/src/locales/zh_TW/common.js b/frontend/src/locales/zh_TW/common.js
index 46f8df3a4..92ba19e6f 100644
--- a/frontend/src/locales/zh_TW/common.js
+++ b/frontend/src/locales/zh_TW/common.js
@@ -389,7 +389,6 @@ const TRANSLATIONS = {
"嵌入是將文字轉換成向量的過程。這些憑證是用於將您的檔案和提示詞轉換成 AnythingLLM 可以處理的格式。",
provider: {
title: "向量嵌入提供者",
- description: "使用 AnythingLLM 的原生嵌入引擎時,不需要任何設定。",
},
},
text: {
diff --git a/server/.env.example b/server/.env.example
index 65e974810..df0b20082 100644
--- a/server/.env.example
+++ b/server/.env.example
@@ -138,6 +138,10 @@ SIG_SALT='salt' # Please generate random string at least 32 chars long.
###########################################
######## Embedding API SElECTION ##########
###########################################
+# This will be the assumed default embedding selection and model
+# EMBEDDING_ENGINE='native'
+# EMBEDDING_MODEL_PREF='Xenova/all-MiniLM-L6-v2'
+
# Only used if you are using an LLM that does not natively support embedding (openai or Azure)
# EMBEDDING_ENGINE='openai'
# OPEN_AI_KEY=sk-xxxx
diff --git a/server/__tests__/utils/TextSplitter/index.test.js b/server/__tests__/utils/TextSplitter/index.test.js
new file mode 100644
index 000000000..8ca0160c1
--- /dev/null
+++ b/server/__tests__/utils/TextSplitter/index.test.js
@@ -0,0 +1,104 @@
+const { TextSplitter } = require("../../../utils/TextSplitter");
+const _ = require("lodash");
+
+describe("TextSplitter", () => {
+ test("should split long text into n sized chunks", async () => {
+ const text = "This is a test text to be split into chunks".repeat(2);
+ const textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ });
+ const chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ });
+
+ test("applies chunk overlap of 20 characters on invalid chunkOverlap", async () => {
+ const text = "This is a test text to be split into chunks".repeat(2);
+ const textSplitter = new TextSplitter({
+ chunkSize: 30,
+ });
+ const chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(6);
+ });
+
+ test("does not allow chunkOverlap to be greater than chunkSize", async () => {
+ expect(() => {
+ new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 21,
+ });
+ }).toThrow();
+ });
+
+ test("applies specific metadata to stringifyHeader to each chunk", async () => {
+ const metadata = {
+ id: "123e4567-e89b-12d3-a456-426614174000",
+ url: "https://example.com",
+ title: "Example",
+ docAuthor: "John Doe",
+ published: "2021-01-01",
+ chunkSource: "link://https://example.com",
+ description: "This is a test text to be split into chunks",
+ };
+ const chunkHeaderMeta = TextSplitter.buildHeaderMeta(metadata);
+ expect(chunkHeaderMeta).toEqual({
+ sourceDocument: metadata.title,
+ source: metadata.url,
+ published: metadata.published,
+ });
+ });
+
+ test("applies a valid chunkPrefix to each chunk", async () => {
+ const text = "This is a test text to be split into chunks".repeat(2);
+ let textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ chunkPrefix: "testing: ",
+ });
+ let chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ expect(chunks.every(chunk => chunk.startsWith("testing: "))).toBe(true);
+
+ textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ chunkPrefix: "testing2: ",
+ });
+ chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ expect(chunks.every(chunk => chunk.startsWith("testing2: "))).toBe(true);
+
+ textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ chunkPrefix: undefined,
+ });
+ chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
+
+ textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ chunkPrefix: "",
+ });
+ chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ expect(chunks.every(chunk => !chunk.startsWith(": "))).toBe(true);
+
+ // Applied chunkPrefix with chunkHeaderMeta
+ textSplitter = new TextSplitter({
+ chunkSize: 20,
+ chunkOverlap: 0,
+ chunkHeaderMeta: TextSplitter.buildHeaderMeta({
+ title: "Example",
+ url: "https://example.com",
+ published: "2021-01-01",
+ }),
+ chunkPrefix: "testing3: ",
+ });
+ chunks = await textSplitter.splitText(text);
+ expect(chunks.length).toEqual(5);
+ expect(chunks.every(chunk => chunk.startsWith("testing3: "))).toBe(true);
+ });
+});
diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
index df2eca134..999e75029 100644
--- a/server/models/systemSettings.js
+++ b/server/models/systemSettings.js
@@ -8,6 +8,7 @@ const prisma = require("../utils/prisma");
const { v4 } = require("uuid");
const { MetaGenerator } = require("../utils/boot/MetaGenerator");
const { PGVector } = require("../utils/vectorDbProviders/pgvector");
+const { NativeEmbedder } = require("../utils/EmbeddingEngines/native");
const { getBaseLLMProviderModel } = require("../utils/helpers");
function isNullOrNaN(value) {
@@ -194,6 +195,7 @@ const SystemSettings = {
const { hasVectorCachedFiles } = require("../utils/files");
const llmProvider = process.env.LLM_PROVIDER;
const vectorDB = process.env.VECTOR_DB;
+ const embeddingEngine = process.env.EMBEDDING_ENGINE ?? "native";
return {
// --------------------------------------------------------
// General Settings
@@ -208,11 +210,14 @@ const SystemSettings = {
// --------------------------------------------------------
// Embedder Provider Selection Settings & Configs
// --------------------------------------------------------
- EmbeddingEngine: process.env.EMBEDDING_ENGINE,
+ EmbeddingEngine: embeddingEngine,
HasExistingEmbeddings: await this.hasEmbeddings(), // check if they have any currently embedded documents active in workspaces.
HasCachedEmbeddings: hasVectorCachedFiles(), // check if they any currently cached embedded docs.
EmbeddingBasePath: process.env.EMBEDDING_BASE_PATH,
- EmbeddingModelPref: process.env.EMBEDDING_MODEL_PREF,
+ EmbeddingModelPref:
+ embeddingEngine === "native"
+ ? NativeEmbedder._getEmbeddingModel()
+ : process.env.EMBEDDING_MODEL_PREF,
EmbeddingModelMaxChunkLength:
process.env.EMBEDDING_MODEL_MAX_CHUNK_LENGTH,
VoyageAiApiKey: !!process.env.VOYAGEAI_API_KEY,
diff --git a/server/storage/models/.gitignore b/server/storage/models/.gitignore
index 5e83df7bc..e73faa055 100644
--- a/server/storage/models/.gitignore
+++ b/server/storage/models/.gitignore
@@ -9,4 +9,5 @@ gemini
togetherAi
tesseract
ppio
-context-windows/*
\ No newline at end of file
+context-windows/*
+MintplexLabs
\ No newline at end of file
diff --git a/server/utils/EmbeddingEngines/native/constants.js b/server/utils/EmbeddingEngines/native/constants.js
new file mode 100644
index 000000000..76c4d96c3
--- /dev/null
+++ b/server/utils/EmbeddingEngines/native/constants.js
@@ -0,0 +1,63 @@
+const SUPPORTED_NATIVE_EMBEDDING_MODELS = {
+ "Xenova/all-MiniLM-L6-v2": {
+ maxConcurrentChunks: 25,
+ // Right now, this is NOT the token length, and is instead the number of characters
+ // that can be processed in a single pass. So we override to 1,000 characters,
+ // roughly the max number of tokens assuming 2 characters per token (undershooting).
+ // embeddingMaxChunkLength: 512, (from the model card)
+ embeddingMaxChunkLength: 1_000,
+ chunkPrefix: "",
+ queryPrefix: "",
+ apiInfo: {
+ id: "Xenova/all-MiniLM-L6-v2",
+ name: "all-MiniLM-L6-v2",
+ description:
+ "A lightweight and fast model for embedding text. The default model for AnythingLLM.",
+ lang: "English",
+ size: "23MB",
+ modelCard: "https://huggingface.co/Xenova/all-MiniLM-L6-v2",
+ },
+ },
+ "Xenova/nomic-embed-text-v1": {
+ maxConcurrentChunks: 5,
+ // Right now, this is NOT the token length, and is instead the number of characters
+ // that can be processed in a single pass. So we override to 16,000 characters,
+ // roughly the max number of tokens assuming 2 characters per token (undershooting).
+ // embeddingMaxChunkLength: 8192, (from the model card)
+ embeddingMaxChunkLength: 16_000,
+ chunkPrefix: "search_document: ",
+ queryPrefix: "search_query: ",
+ apiInfo: {
+ id: "Xenova/nomic-embed-text-v1",
+ name: "nomic-embed-text-v1",
+ description:
+ "A high-performing open embedding model with a large token context window. Requires more processing power and memory.",
+ lang: "English",
+ size: "139MB",
+ modelCard: "https://huggingface.co/Xenova/nomic-embed-text-v1",
+ },
+ },
+ "MintplexLabs/multilingual-e5-small": {
+ maxConcurrentChunks: 5,
+ // Right now, this is NOT the token length, and is instead the number of characters
+ // that can be processed in a single pass. So we override to 1,000 characters,
+ // roughly the max number of tokens assuming 2 characters per token (undershooting).
+ // embeddingMaxChunkLength: 512, (from the model card)
+ embeddingMaxChunkLength: 1_000,
+ chunkPrefix: "passage: ",
+ queryPrefix: "query: ",
+ apiInfo: {
+ id: "MintplexLabs/multilingual-e5-small",
+ name: "multilingual-e5-small",
+ description:
+ "A larger multilingual embedding model that supports 100+ languages. Requires more processing power and memory.",
+ lang: "100+ languages",
+ size: "487MB",
+ modelCard: "https://huggingface.co/intfloat/multilingual-e5-small",
+ },
+ },
+};
+
+module.exports = {
+ SUPPORTED_NATIVE_EMBEDDING_MODELS,
+};
diff --git a/server/utils/EmbeddingEngines/native/index.js b/server/utils/EmbeddingEngines/native/index.js
index 550a14e23..9142d3b3a 100644
--- a/server/utils/EmbeddingEngines/native/index.js
+++ b/server/utils/EmbeddingEngines/native/index.js
@@ -2,37 +2,114 @@ const path = require("path");
const fs = require("fs");
const { toChunks } = require("../../helpers");
const { v4 } = require("uuid");
+const { SUPPORTED_NATIVE_EMBEDDING_MODELS } = require("./constants");
class NativeEmbedder {
+ static defaultModel = "Xenova/all-MiniLM-L6-v2";
+
+ /**
+ * Supported embedding models for native.
+ * @type {Record}
+ */
+ static supportedModels = SUPPORTED_NATIVE_EMBEDDING_MODELS;
+
// This is a folder that Mintplex Labs hosts for those who cannot capture the HF model download
// endpoint for various reasons. This endpoint is not guaranteed to be active or maintained
// and may go offline at any time at Mintplex Labs's discretion.
#fallbackHost = "https://cdn.anythingllm.com/support/models/";
constructor() {
- // Model Card: https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
- this.model = "Xenova/all-MiniLM-L6-v2";
+ this.model = this.getEmbeddingModel();
+ this.modelInfo = this.getEmbedderInfo();
this.cacheDir = path.resolve(
process.env.STORAGE_DIR
? path.resolve(process.env.STORAGE_DIR, `models`)
: path.resolve(__dirname, `../../../storage/models`)
);
- this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
+ this.modelPath = path.resolve(this.cacheDir, ...this.model.split("/"));
this.modelDownloaded = fs.existsSync(this.modelPath);
// Limit of how many strings we can process in a single pass to stay with resource or network limits
- this.maxConcurrentChunks = 25;
- this.embeddingMaxChunkLength = 1_000;
+ this.maxConcurrentChunks = this.modelInfo.maxConcurrentChunks;
+ this.embeddingMaxChunkLength = this.modelInfo.embeddingMaxChunkLength;
// Make directory when it does not exist in existing installations
if (!fs.existsSync(this.cacheDir)) fs.mkdirSync(this.cacheDir);
- this.log("Initialized");
+ this.log(`Initialized ${this.model}`);
}
log(text, ...args) {
console.log(`\x1b[36m[NativeEmbedder]\x1b[0m ${text}`, ...args);
}
+ /**
+ * Get the selected model from the environment variable, falling back to the default model if it is unset or unsupported.
+ * @returns {string}
+ */
+ static _getEmbeddingModel() {
+ const envModel =
+ process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
+ if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
+ return NativeEmbedder.defaultModel;
+ }
+
+ get embeddingPrefix() {
+ return NativeEmbedder.supportedModels[this.model]?.chunkPrefix || "";
+ }
+
+ get queryPrefix() {
+ return NativeEmbedder.supportedModels[this.model]?.queryPrefix || "";
+ }
+
+ /**
+ * Get the available models in an API response format
+ * we can use to populate the frontend dropdown.
+ * @returns {{id: string, name: string, description: string, lang: string, size: string, modelCard: string}[]}
+ */
+ static availableModels() {
+ return Object.values(NativeEmbedder.supportedModels).map(
+ (model) => model.apiInfo
+ );
+ }
+
+ /**
+ * Get the embedding model to use.
+ * We only support a few models and will fall back to the default model if the environment variable is not set or names an unsupported model.
+ *
+ * Why only a few? Because we need to mirror them on the CDN so non-US users can download them.
+ * eg: "Xenova/all-MiniLM-L6-v2"
+ * eg: "Xenova/nomic-embed-text-v1"
+ * @returns {string}
+ */
+ getEmbeddingModel() {
+ const envModel =
+ process.env.EMBEDDING_MODEL_PREF ?? NativeEmbedder.defaultModel;
+ if (NativeEmbedder.supportedModels?.[envModel]) return envModel;
+ return NativeEmbedder.defaultModel;
+ }
+
+ /**
+ * Get the embedding model info.
+ *
+ * Will always fall back to the default model if the model is not supported.
+ * @returns {Object}
+ */
+ getEmbedderInfo() {
+ const model = this.getEmbeddingModel();
+ return NativeEmbedder.supportedModels[model];
+ }
+
#tempfilePath() {
const filename = `${v4()}.tmp`;
const tmpPath = process.env.STORAGE_DIR
@@ -124,7 +201,27 @@ class NativeEmbedder {
throw fetchResponse.error;
}
+ /**
+ * Apply the query prefix to the text input if it is required by the model.
+ * eg: nomic-embed-text-v1 requires a query prefix for embedding/searching.
+ * @param {string|string[]} textInput - The text to embed.
+ * @returns {string|string[]} The text with the prefix applied.
+ */
+ #applyQueryPrefix(textInput) {
+ if (!this.queryPrefix) return textInput;
+ if (Array.isArray(textInput))
+ textInput = textInput.map((text) => `${this.queryPrefix}${text}`);
+ else textInput = `${this.queryPrefix}${textInput}`;
+ return textInput;
+ }
+
+ /**
+ * Embed a single text input.
+ * @param {string|string[]} textInput - The text to embed.
+ * @returns {Promise<Array<number>>} The embedded text.
+ */
async embedTextInput(textInput) {
+ textInput = this.#applyQueryPrefix(textInput);
const result = await this.embedChunks(
Array.isArray(textInput) ? textInput : [textInput]
);
diff --git a/server/utils/TextSplitter/index.js b/server/utils/TextSplitter/index.js
index fe6fe95cc..c3f03bfb8 100644
--- a/server/utils/TextSplitter/index.js
+++ b/server/utils/TextSplitter/index.js
@@ -20,22 +20,16 @@ function isNullOrNaN(value) {
class TextSplitter {
#splitter;
+
+ /**
+ * Creates a new TextSplitter instance.
+ * @param {Object} config
+ * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
+ * @param {number} [config.chunkSize = 1000] - The size of each chunk.
+ * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
+ * @param {Object} [config.chunkHeaderMeta = null] - Metadata to be added to the start of each chunk - will come after the prefix.
+ */
constructor(config = {}) {
- /*
- config can be a ton of things depending on what is required or optional by the specific splitter.
- Non-splitter related keys
- {
- splitByFilename: string, // TODO
- }
- ------
- Default: "RecursiveCharacterTextSplitter"
- Config: {
- chunkSize: number,
- chunkOverlap: number,
- chunkHeaderMeta: object | null, // Gets appended to top of each chunk as metadata
- }
- ------
- */
this.config = config;
this.#splitter = this.#setSplitter(config);
}
@@ -124,20 +118,41 @@ class TextSplitter {
}
/**
- * Creates a string of metadata to be prepended to each chunk.
+ * Apply the chunk prefix to the text if it is present.
+ * @param {string} text - The text to apply the prefix to.
+ * @returns {string} The text with the embedder model prefix applied.
+ */
+ #applyPrefix(text = "") {
+ if (!this.config.chunkPrefix) return text;
+ return `${this.config.chunkPrefix}${text}`;
+ }
+
+ /**
+ * Creates a string of metadata to be prepended to each chunk.
+ * Will additionally prepend a prefix to the text if it was provided (requirement for some embedders).
+ * @returns {string} The header string with the embedder model prefix applied.
*/
stringifyHeader() {
- if (!this.config.chunkHeaderMeta) return null;
let content = "";
+ if (!this.config.chunkHeaderMeta) return this.#applyPrefix(content);
Object.entries(this.config.chunkHeaderMeta).map(([key, value]) => {
if (!key || !value) return;
content += `${key}: ${value}\n`;
});
- if (!content) return null;
- return `\n${content}\n\n`;
+ if (!content) return this.#applyPrefix(content);
+ return this.#applyPrefix(
+ `\n${content}\n\n`
+ );
}
+ /**
+ * Sets the splitter to use a defined config that is passed to other subclasses.
+ * @param {Object} config
+ * @param {string} [config.chunkPrefix = ""] - Prefix to be added to the start of each chunk.
+ * @param {number} [config.chunkSize = 1000] - The size of each chunk.
+ * @param {number} [config.chunkOverlap = 20] - The overlap between chunks.
+ */
#setSplitter(config = {}) {
// if (!config?.splitByFilename) {// TODO do something when specific extension is present? }
return new RecursiveSplitter({
@@ -160,7 +175,11 @@ class RecursiveSplitter {
const {
RecursiveCharacterTextSplitter,
} = require("@langchain/textsplitters");
- this.log(`Will split with`, { chunkSize, chunkOverlap });
+ this.log(`Will split with`, {
+ chunkSize,
+ chunkOverlap,
+ chunkHeader: chunkHeader ? `${chunkHeader?.slice(0, 50)}...` : null,
+ });
this.chunkHeader = chunkHeader;
this.engine = new RecursiveCharacterTextSplitter({
chunkSize,
diff --git a/server/utils/helpers/customModels.js b/server/utils/helpers/customModels.js
index de855f0b3..e0a1fb820 100644
--- a/server/utils/helpers/customModels.js
+++ b/server/utils/helpers/customModels.js
@@ -1,7 +1,6 @@
const { fetchOpenRouterModels } = require("../AiProviders/openRouter");
const { fetchApiPieModels } = require("../AiProviders/apipie");
const { perplexityModels } = require("../AiProviders/perplexity");
-const { togetherAiModels } = require("../AiProviders/togetherAi");
const { fireworksAiModels } = require("../AiProviders/fireworksAi");
const { ElevenLabsTTS } = require("../TextToSpeech/elevenLabs");
const { fetchNovitaModels } = require("../AiProviders/novita");
@@ -34,6 +33,8 @@ const SUPPORT_CUSTOM_MODELS = [
"ppio",
"dpais",
"moonshotai",
+ // Embedding Engines
+ "native-embedder",
];
async function getCustomModels(provider = "", apiKey = null, basePath = null) {
@@ -87,6 +88,8 @@ async function getCustomModels(provider = "", apiKey = null, basePath = null) {
return await getDellProAiStudioModels(basePath);
case "moonshotai":
return await getMoonshotAiModels(apiKey);
+ case "native-embedder":
+ return await getNativeEmbedderModels();
default:
return { models: [], error: "Invalid provider for custom models" };
}
@@ -678,6 +681,11 @@ async function getDellProAiStudioModels(basePath = null) {
}
}
+function getNativeEmbedderModels() {
+ const { NativeEmbedder } = require("../EmbeddingEngines/native");
+ return { models: NativeEmbedder.availableModels(), error: null };
+}
+
async function getMoonshotAiModels(_apiKey = null) {
const apiKey =
_apiKey === true
diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
index ce28e73a1..d92cd36df 100644
--- a/server/utils/helpers/updateENV.js
+++ b/server/utils/helpers/updateENV.js
@@ -288,7 +288,7 @@ const KEY_MAPPING = {
EmbeddingModelPref: {
envKey: "EMBEDDING_MODEL_PREF",
checks: [isNotEmpty],
- postUpdate: [handleVectorStoreReset],
+ postUpdate: [handleVectorStoreReset, downloadEmbeddingModelIfRequired],
},
EmbeddingModelMaxChunkLength: {
envKey: "EMBEDDING_MODEL_MAX_CHUNK_LENGTH",
@@ -927,6 +927,22 @@ async function handleVectorStoreReset(key, prevValue, nextValue) {
return false;
}
+/**
+ * Downloads the embedding model in the background if the user has selected a different model
+ * - Only supported for the native embedder
+ * - Must have the native embedder selected prior (otherwise will download on embed)
+ */
+async function downloadEmbeddingModelIfRequired(key, prevValue, nextValue) {
+ if (prevValue === nextValue) return;
+ if (key !== "EmbeddingModelPref" || process.env.EMBEDDING_ENGINE !== "native")
+ return;
+
+ const { NativeEmbedder } = require("../EmbeddingEngines/native");
+ if (!NativeEmbedder.supportedModels[nextValue]) return; // if the model is not supported, don't download it
+ new NativeEmbedder().embedderClient();
+ return false;
+}
+
/**
* Validates the Postgres connection string for the PGVector options.
* @param {string} input - The Postgres connection string to validate.
diff --git a/server/utils/vectorDbProviders/astra/index.js b/server/utils/vectorDbProviders/astra/index.js
index 6e77ccf4d..b34a8d83a 100644
--- a/server/utils/vectorDbProviders/astra/index.js
+++ b/server/utils/vectorDbProviders/astra/index.js
@@ -206,6 +206,7 @@ const AstraDB = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/chroma/index.js b/server/utils/vectorDbProviders/chroma/index.js
index 4ec2e6d79..bc12818fd 100644
--- a/server/utils/vectorDbProviders/chroma/index.js
+++ b/server/utils/vectorDbProviders/chroma/index.js
@@ -252,6 +252,7 @@ const Chroma = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/lance/index.js b/server/utils/vectorDbProviders/lance/index.js
index 572a2615a..563095fe5 100644
--- a/server/utils/vectorDbProviders/lance/index.js
+++ b/server/utils/vectorDbProviders/lance/index.js
@@ -328,6 +328,7 @@ const LanceDb = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/milvus/index.js b/server/utils/vectorDbProviders/milvus/index.js
index 1a7ab2e41..2ddaad567 100644
--- a/server/utils/vectorDbProviders/milvus/index.js
+++ b/server/utils/vectorDbProviders/milvus/index.js
@@ -204,6 +204,7 @@ const Milvus = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/pgvector/index.js b/server/utils/vectorDbProviders/pgvector/index.js
index 740a1f85a..057f1b799 100644
--- a/server/utils/vectorDbProviders/pgvector/index.js
+++ b/server/utils/vectorDbProviders/pgvector/index.js
@@ -536,6 +536,7 @@ const PGVector = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/pinecone/index.js b/server/utils/vectorDbProviders/pinecone/index.js
index 55e3f0bc3..c5c55acb5 100644
--- a/server/utils/vectorDbProviders/pinecone/index.js
+++ b/server/utils/vectorDbProviders/pinecone/index.js
@@ -150,6 +150,7 @@ const PineconeDB = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/qdrant/index.js b/server/utils/vectorDbProviders/qdrant/index.js
index e8eee0726..50fe5fab3 100644
--- a/server/utils/vectorDbProviders/qdrant/index.js
+++ b/server/utils/vectorDbProviders/qdrant/index.js
@@ -222,6 +222,7 @@ const QDrant = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/weaviate/index.js b/server/utils/vectorDbProviders/weaviate/index.js
index bb0111e76..2385c5e8e 100644
--- a/server/utils/vectorDbProviders/weaviate/index.js
+++ b/server/utils/vectorDbProviders/weaviate/index.js
@@ -263,6 +263,7 @@ const Weaviate = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);
diff --git a/server/utils/vectorDbProviders/zilliz/index.js b/server/utils/vectorDbProviders/zilliz/index.js
index c882a8572..ab866f4ed 100644
--- a/server/utils/vectorDbProviders/zilliz/index.js
+++ b/server/utils/vectorDbProviders/zilliz/index.js
@@ -197,6 +197,7 @@ const Zilliz = {
20
),
chunkHeaderMeta: TextSplitter.buildHeaderMeta(metadata),
+ chunkPrefix: EmbedderEngine?.embeddingPrefix,
});
const textChunks = await textSplitter.splitText(pageContent);