From 33119ae2a4e928bb54dcd25380fde16117c52e82 Mon Sep 17 00:00:00 2001
From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com>
Date: Sat, 28 Feb 2026 16:00:13 +0100
Subject: [PATCH] refactor: remove deprecated settings-based local search
engines (#2344)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
* refactor: remove deprecated settings-based local search engines
The old settings-based local engines (research_papers, project_docs,
personal_notes, local_all) are fully superseded by the database-backed
Collection system with CollectionSearchEngine and LibraryRAGSearchEngine.
- Delete LocalAllSearchEngine and LocalSearchEngine classes
- Remove 58 settings entries from default_settings.json
- Remove local engine registration from search_engines_config.py
- Remove local_search_engines() function
- Clean up LocalEmbeddingManager: remove 14 dead methods and unused attributes
- Remove Docker volume mounts for local_collections
- Update security whitelist, rate limiter, bearer config
- Remove dead force_reindex code path in research_functions.py
- Update docs to reference Collections UI
- Remove/update all associated tests
- Regenerate golden master settings
* fix: address review comments from djpetti
- Revert unintentional formatting change in theme options (keep compact inline format)
- Restore Unicode arrow character (→) that was escaped to \u2192 by the JSON serializer
- Rename search_engine_local.py → local_embedding_manager.py since it only contains
LocalEmbeddingManager now (no search engines)
- Remove unused chunk_size, chunk_overlap, cache_dir params from LocalEmbeddingManager
- Update all imports and references across codebase
---
.github/scripts/check-file-writes.sh | 2 +-
.secrets.baseline | 4 +-
bearer.yml | 1 -
docker-compose.unraid.yml | 5 -
docker-compose.yml | 14 -
docs/deployment/unraid.md | 25 +-
docs/docker-compose-guide.md | 12 +-
docs/faq.md | 29 +-
.../api/research_functions.py | 7 -
src/local_deep_research/defaults/__init__.py | 1 -
.../defaults/default_settings.json | 900 ----------
.../research_library/routes/rag_routes.py | 4 +-
.../services/library_rag_service.py | 4 +-
.../security/module_whitelist.py | 5 +-
src/local_deep_research/web/api.py | 2 +-
.../engines/local_embedding_manager.py | 290 ++++
.../engines/search_engine_local.py | 1467 -----------------
.../engines/search_engine_local_all.py | 170 --
.../rate_limiting/tracker.py | 1 -
.../search_engines_config.py | 105 +-
tests/rate_limiting/test_rate_limiting.py | 10 -
.../routes/test_rag_routes.py | 2 +-
.../test_absolute_module_paths_hook.py | 2 +-
tests/settings/golden_master_settings.json | 900 ----------
.../engines/test_local_embedding_manager.py | 96 ++
.../engines/test_search_engine_local.py | 1073 ------------
.../engines/test_search_engine_local_all.py | 463 ------
.../rate_limiting/test_tracker.py | 31 -
.../test_local_embedding_manager.py | 219 +--
.../test_search_engines_config.py | 254 ---
unraid-templates/local-deep-research.xml | 3 -
31 files changed, 454 insertions(+), 5647 deletions(-)
create mode 100644 src/local_deep_research/web_search_engines/engines/local_embedding_manager.py
delete mode 100644 src/local_deep_research/web_search_engines/engines/search_engine_local.py
delete mode 100644 src/local_deep_research/web_search_engines/engines/search_engine_local_all.py
create mode 100644 tests/web_search_engines/engines/test_local_embedding_manager.py
delete mode 100644 tests/web_search_engines/engines/test_search_engine_local.py
delete mode 100644 tests/web_search_engines/engines/test_search_engine_local_all.py
diff --git a/.github/scripts/check-file-writes.sh b/.github/scripts/check-file-writes.sh
index d140fabfd..74b0820ad 100755
--- a/.github/scripts/check-file-writes.sh
+++ b/.github/scripts/check-file-writes.sh
@@ -181,7 +181,7 @@ if [ -n "$ALL_MATCHES" ]; then
# Filter system config files (not user data)
if [ "$skip_line" -eq 0 ]; then
- if echo "$line" | grep -qE "web/app_factory\.py|web/server_config\.py|web_search_engines/engines/search_engine_local\.py|document_loaders/bytes_loader\.py"; then
+ if echo "$line" | grep -qE "web/app_factory\.py|web/server_config\.py|document_loaders/bytes_loader\.py"; then
skip_line=1
fi
fi
diff --git a/.secrets.baseline b/.secrets.baseline
index da1b4b1a7..0df2e81b6 100644
--- a/.secrets.baseline
+++ b/.secrets.baseline
@@ -677,7 +677,7 @@
"filename": "src/local_deep_research/defaults/default_settings.json",
"hashed_secret": "7b976de60179f0603eec51250b33aacd56d90ef6",
"is_verified": false,
- "line_number": 5946
+ "line_number": 5046
}
],
"src/local_deep_research/llm/providers/implementations/anthropic.py": [
@@ -5188,5 +5188,5 @@
}
]
},
- "generated_at": "2026-02-28T11:10:23Z"
+ "generated_at": "2026-02-28T12:03:11Z"
}
diff --git a/bearer.yml b/bearer.yml
index f7c85d74b..7bb2ca043 100644
--- a/bearer.yml
+++ b/bearer.yml
@@ -26,7 +26,6 @@ rule:
# for these non-cryptographic uses where collision resistance is not critical.
#
# Usage locations:
- # - search_engine_local.py: Cache key generation for search results
# - research_service.py: Content deduplication hashes
# - search_cache.py: Cache key generation
# - benchmark_service.py: Test result identification
diff --git a/docker-compose.unraid.yml b/docker-compose.unraid.yml
index 3115a0fff..b5d91470a 100644
--- a/docker-compose.unraid.yml
+++ b/docker-compose.unraid.yml
@@ -22,11 +22,6 @@ services:
- /mnt/user/appdata/local-deep-research/data:/data
- /mnt/user/appdata/local-deep-research/scripts:/scripts
- # Optional: Uncomment to add your document directories
- # - /mnt/user/documents/personal:/local_collections/personal_notes:ro
- # - /mnt/user/documents/projects:/local_collections/project_docs:ro
- # - /mnt/user/documents/papers:/local_collections/research_papers:ro
-
ollama:
volumes:
# Override named volume with Unraid path
diff --git a/docker-compose.yml b/docker-compose.yml
index 115a6ed9c..849de6621 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -122,20 +122,6 @@ services:
volumes:
- ldr_data:/data
- ldr_scripts:/scripts
- # ============================================================================
- # LOCAL DOCUMENT COLLECTIONS (Optional)
- # Mount your document directories here to search them with LDR.
- #
- # For Unraid users, replace paths with your Unraid shares:
- # - /mnt/user/documents/personal:/local_collections/personal_notes/:ro
- # - /mnt/user/documents/projects:/local_collections/project_docs/:ro
- # - /mnt/user/papers:/local_collections/research_papers/:ro
- #
- # The :ro (read-only) suffix is recommended for safety.
- # ============================================================================
- - ./local_collections/personal_notes:/local_collections/personal_notes/:ro
- - ./local_collections/project_docs:/local_collections/project_docs/:ro
- - ./local_collections/research_papers:/local_collections/research_papers/:ro
# ============================================================================
# CONTAINER SECURITY — Principle of Least Privilege
# ============================================================================
diff --git a/docs/deployment/unraid.md b/docs/deployment/unraid.md
index 8bdf8dab2..e071ee0d7 100644
--- a/docs/deployment/unraid.md
+++ b/docs/deployment/unraid.md
@@ -117,7 +117,6 @@ All volumes should be under `/mnt/user/appdata/local-deep-research/` for best pr
| `/scripts` | `/mnt/user/appdata/local-deep-research/scripts` | Startup scripts (for Ollama integration) | Yes |
| `/root/.ollama` (ollama) | `/mnt/user/appdata/local-deep-research/ollama` | Downloaded LLM models (5-15GB each) | If using Ollama |
| `/etc/searxng` (searxng) | `/mnt/user/appdata/local-deep-research/searxng` | SearXNG configuration | If using SearXNG |
-| `/local_collections/*` | `/mnt/user/documents/*` | Your document directories to search | Optional |
**Performance Tip:** If your appdata share is set to "cache-only", you can use `/mnt/cache/appdata/local-deep-research/` instead of `/mnt/user/appdata/local-deep-research/` for better performance (bypasses FUSE overhead).
@@ -183,26 +182,12 @@ If running LDR alone with external services:
## 🎮 Using Local Documents
-To search your Unraid shares (documents, notes, etc.):
+To search your local documents, use the **Collections** system in the Web UI:
-**For Template Installation (Method 1):**
-1. Edit the container
-2. Add **Path** mappings under volume configuration:
- - **Container Path:** `/local_collections/personal_notes` → **Host Path:** `/mnt/user/documents/personal` (Read-only)
- - **Container Path:** `/local_collections/project_docs` → **Host Path:** `/mnt/user/documents/projects` (Read-only)
- - **Container Path:** `/local_collections/research_papers` → **Host Path:** `/mnt/user/papers` (Read-only)
-3. Apply changes and restart
-
-**For Docker Compose Installation (Method 2):**
-1. Edit `docker-compose.unraid.yml`
-2. Uncomment the document collection lines and adjust paths:
- ```yaml
- - /mnt/user/documents/personal:/local_collections/personal_notes:ro
- - /mnt/user/documents/projects:/local_collections/project_docs:ro
- ```
-3. Run **Compose Down** then **Compose Up** to apply changes
-
-These paths will then be available in LDR's WebUI Settings for searching.
+1. Open the LDR Web UI and navigate to the **Collections** page
+2. Create a new collection (e.g., "Research Papers", "Project Docs")
+3. Upload documents directly through the browser — no volume mounts needed
+4. Select your collection as a search engine, or use **"Search All Collections"** to search across everything
## 🎯 GPU Acceleration (NVIDIA)
diff --git a/docs/docker-compose-guide.md b/docs/docker-compose-guide.md
index 67ca524f7..cba909c81 100644
--- a/docs/docker-compose-guide.md
+++ b/docs/docker-compose-guide.md
@@ -84,16 +84,8 @@ ports:
### Local Document Collections
-Mount directories to search your own documents:
-
-```yaml
-volumes:
- - ./local_collections/personal_notes:/local_collections/personal_notes/
- - ./local_collections/project_docs:/local_collections/project_docs/
- - /path/to/your/papers:/local_collections/research_papers/:ro
-```
-
-The `:ro` suffix makes mounts read-only for safety.
+Use the **Collections** system in the Web UI to manage your local documents.
+Upload files directly through the Collections page — no volume mounts required.
## Advanced: Cookie Cutter Configuration
diff --git a/docs/faq.md b/docs/faq.md
index f3cff4bad..71416fc58 100644
--- a/docs/faq.md
+++ b/docs/faq.md
@@ -325,32 +325,21 @@ See also: [Environment Variables Documentation](env_configuration.md#openrouter)
## Local Document Search
-### How do I configure local document paths?
+### How do I search my local documents?
-1. **In Web UI**:
- - Settings → Search for "local"
- - Edit "Document Collection Paths"
- - Use absolute paths: `["/home/user/documents", "/data/pdfs"]`
+Use the **Collections** system in the Web UI:
-2. **For Docker**: Mount volumes
- ```bash
- docker run -v /host/path:/container/path ...
- ```
- Then use container path in settings: `["/container/path"]`
+1. **Navigate** to the Collections page in the sidebar
+2. **Create a collection** (e.g., "Research Papers", "Project Docs")
+3. **Upload documents** directly through the UI — supported formats include PDF, TXT, MD, DOCX, and many more
+4. **Search** your collections by selecting them as a search engine, or use **"Search All Collections"** (Library RAG) to search across everything
### Local search not finding documents
Common issues:
-1. **First search is slow** - Initial indexing takes time
-2. **Path format** - Use absolute paths, not relative
-3. **File types** - Ensure supported formats (PDF, TXT, MD, DOCX)
-4. **Permissions** - Check read permissions
-
-### The @format syntax in settings
-
-This is a UI hint to expand environment variables. Replace with actual paths:
-- Change: `"@format ${DOCS_DIR}/personal_notes"`
-- To: `"/home/user/documents/personal_notes"`
+1. **First search is slow** — initial indexing takes time
+2. **File types** — ensure supported formats (PDF, TXT, MD, DOCX)
+3. **Collection not indexed** — re-upload or re-index via the Collections UI
## Performance & Optimization
diff --git a/src/local_deep_research/api/research_functions.py b/src/local_deep_research/api/research_functions.py
index ff5b9aa47..ecdf6afa1 100644
--- a/src/local_deep_research/api/research_functions.py
+++ b/src/local_deep_research/api/research_functions.py
@@ -577,13 +577,6 @@ def analyze_documents(
# Set max results
search.max_results = max_results
- # Force reindex if requested
- if force_reindex and hasattr(search, "embedding_manager"):
- for folder_path in search.folder_paths:
- search.embedding_manager.index_folder(
- folder_path, force_reindex=True
- )
-
# Perform the search
results = search.run(query)
diff --git a/src/local_deep_research/defaults/__init__.py b/src/local_deep_research/defaults/__init__.py
index d25ece016..71c0a5fce 100644
--- a/src/local_deep_research/defaults/__init__.py
+++ b/src/local_deep_research/defaults/__init__.py
@@ -15,7 +15,6 @@ DEFAULTS_DIR = Path(__file__).parent
# Default files available in this package
DEFAULT_FILES = {
"main.toml": DEFAULTS_DIR / "main.toml",
- "local_collections.toml": DEFAULTS_DIR / "local_collections.toml",
"search_engines.toml": DEFAULTS_DIR / "search_engines.toml",
}
diff --git a/src/local_deep_research/defaults/default_settings.json b/src/local_deep_research/defaults/default_settings.json
index 268b7ca32..d8858cfad 100644
--- a/src/local_deep_research/defaults/default_settings.json
+++ b/src/local_deep_research/defaults/default_settings.json
@@ -2393,34 +2393,6 @@
],
"visible": true
},
- "search.engine.local.local_all.display_name": {
- "category": "local_all",
- "description": "Display name to use in the U.I. for this search engine.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Display Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Local Documents",
- "visible": false
- },
- "search.engine.local.local_all.description": {
- "category": "local_all",
- "description": "Human-readable description of the search engine.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Search only local documents using RAG.",
- "visible": false
- },
"search.engine.DEFAULT_SEARCH_ENGINE": {
"category": "local_all",
"description": "Fallback search engine used when the configured engine is unavailable or has errors.",
@@ -2435,585 +2407,6 @@
"value": "wikipedia",
"visible": true
},
- "search.engine.local.local_all.class_name": {
- "category": "local_all",
- "description": "Internal: Python class implementing local document search. Do not modify.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Class Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "LocalAllSearchEngine",
- "visible": false
- },
- "search.engine.local.local_all.module_path": {
- "category": "local_all",
- "description": "Internal: Python module path for local search implementation. Do not modify.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Module Path",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": ".engines.search_engine_local_all",
- "visible": false
- },
- "search.engine.local.local_all.reliability": {
- "category": "local_all",
- "description": "Reliability score (0-1) for local search. Quality depends on your document collection and indexing.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.85,
- "visible": true
- },
- "search.engine.local.local_all.requires_api_key": {
- "category": "local_all",
- "description": "Local document search does not require any external API keys.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Requires Api Key",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.local_all.requires_llm": {
- "category": "local_all",
- "description": "Indicates this engine uses the LLM to rerank and filter results for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Requires Llm",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.local_all.strengths": {
- "category": "local_all",
- "description": "Advantages: Searches all local document collections at once, works offline, uses your private documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "searches all local collections",
- "personal documents",
- "offline access"
- ],
- "visible": true
- },
- "search.engine.local.local_all.weaknesses": {
- "category": "local_all",
- "description": "Limitations: May return too many results from mixed collections, requires documents to be indexed first.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "may return too many results",
- "requires indexing"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.cache_dir": {
- "category": "personal_notes",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.personal_notes.chunk_overlap": {
- "category": "personal_notes",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 100,
- "visible": true
- },
- "search.engine.local.personal_notes.chunk_size": {
- "category": "personal_notes",
- "description": "Maximum characters per chunk when splitting documents for RAG indexing. Smaller = more precise, larger = more context.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 500,
- "visible": true
- },
- "search.engine.local.personal_notes.description": {
- "category": "personal_notes",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Personal notes and documents",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_device": {
- "category": "personal_notes",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_model": {
- "category": "personal_notes",
- "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_model_type": {
- "category": "personal_notes",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.personal_notes.enabled": {
- "category": "personal_notes",
- "description": "Enable this document collection for searching. Disable if you don't want to index these documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.personal_notes.max_filtered_results": {
- "category": "personal_notes",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 10,
- "visible": true
- },
- "search.engine.local.personal_notes.max_results": {
- "category": "personal_notes",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 30,
- "visible": true
- },
- "search.engine.local.personal_notes.name": {
- "category": "personal_notes",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Personal Notes",
- "visible": true
- },
- "search.engine.local.personal_notes.paths": {
- "category": "personal_notes",
- "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/personal_notes"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.reliability": {
- "category": "personal_notes",
- "description": "Reliability score (0-1). Personal notes are rated lower (0.75) as they may contain informal or subjective content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.75,
- "visible": true
- },
- "search.engine.local.personal_notes.strengths": {
- "category": "personal_notes",
- "description": "Advantages: Access to your personal knowledge, notes, and private documents not available elsewhere.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "personal knowledge",
- "notes",
- "private documents"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.weaknesses": {
- "category": "personal_notes",
- "description": "Limitations: Content may be subjective, informal, or incomplete compared to published sources.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "subjective content",
- "informal information"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.cache_dir": {
- "category": "project_docs",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.project_docs.chunk_overlap": {
- "category": "project_docs",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 200,
- "visible": true
- },
- "search.engine.local.project_docs.chunk_size": {
- "category": "project_docs",
- "description": "Maximum characters per chunk when splitting documents for RAG indexing. Larger default (1000) suits technical documentation.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 1000,
- "visible": true
- },
- "search.engine.local.project_docs.description": {
- "category": "project_docs",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Project documentation and specifications",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_device": {
- "category": "project_docs",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_model": {
- "category": "project_docs",
- "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_model_type": {
- "category": "project_docs",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.project_docs.enabled": {
- "category": "project_docs",
- "description": "Enable this document collection for searching. Disable if you don't want to index these documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.project_docs.max_filtered_results": {
- "category": "project_docs",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 5,
- "visible": true
- },
- "search.engine.local.project_docs.max_results": {
- "category": "project_docs",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 20,
- "visible": true
- },
- "search.engine.local.project_docs.name": {
- "category": "project_docs",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Project Documents",
- "visible": true
- },
- "search.engine.local.project_docs.paths": {
- "category": "project_docs",
- "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/project_docs/"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.reliability": {
- "category": "project_docs",
- "description": "Reliability score (0-1). Project docs rated moderately (0.8) as they are semi-formal technical content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.9,
- "visible": true
- },
- "search.engine.local.project_docs.strengths": {
- "category": "project_docs",
- "description": "Advantages: Access to project-specific technical docs, READMEs, and internal documentation not available online.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "project documentation",
- "specifications",
- "internal documents"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.weaknesses": {
- "category": "project_docs",
- "description": "Limitations: May be outdated if docs not maintained, limited scope to specific projects.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "no external information",
- "limited to organizational knowledge"
- ],
- "visible": true
- },
"search.engine.web.pubmed.display_name": {
"category": "pubmed",
"description": "Display name to use in the U.I. for this search engine.",
@@ -3277,243 +2670,6 @@
],
"visible": true
},
- "search.engine.local.research_papers.cache_dir": {
- "category": "research_papers",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.research_papers.chunk_overlap": {
- "category": "research_papers",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 150,
- "visible": true
- },
- "search.engine.local.research_papers.chunk_size": {
- "category": "research_papers",
- "description": "Maximum characters per chunk when splitting papers for RAG indexing. Default (800) balances context and precision.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 800,
- "visible": true
- },
- "search.engine.local.research_papers.description": {
- "category": "research_papers",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Academic research papers and articles",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_device": {
- "category": "research_papers",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_model": {
- "category": "research_papers",
- "description": "Model for generating text embeddings. Consider 'allenai/specter' for academic papers if available.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_model_type": {
- "category": "research_papers",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.research_papers.enabled": {
- "category": "research_papers",
- "description": "Enable this document collection for searching. Disable if you don't have local research papers.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.research_papers.max_filtered_results": {
- "category": "research_papers",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 5,
- "visible": true
- },
- "search.engine.local.research_papers.max_results": {
- "category": "research_papers",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 20,
- "visible": true
- },
- "search.engine.local.research_papers.name": {
- "category": "research_papers",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Research Papers",
- "visible": true
- },
- "search.engine.local.research_papers.paths": {
- "category": "research_papers",
- "description": "File paths containing academic papers. Supports PDFs and text formats; directories are indexed recursively.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/research_papers/"
- ],
- "visible": true
- },
- "search.engine.local.research_papers.reliability": {
- "category": "research_papers",
- "description": "Reliability score (0-1). Research papers rated high (0.95) as they are peer-reviewed academic content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.85,
- "visible": true
- },
- "search.engine.local.research_papers.strengths": {
- "category": "research_papers",
- "description": "Advantages: Access to peer-reviewed academic content, scientific papers, and scholarly research in your collection.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "academic research",
- "scientific papers",
- "scholarly content"
- ],
- "visible": true
- },
- "search.engine.local.research_papers.weaknesses": {
- "category": "research_papers",
- "description": "Limitations: Limited to papers in your collection, may be outdated if not regularly updated.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "potentially outdated",
- "limited to collected papers"
- ],
- "visible": true
- },
"search.engine.web.searxng.display_name": {
"category": "searxng",
"description": "Display name to use in the U.I. for this search engine.",
@@ -5132,62 +4288,6 @@
"value": false,
"visible": true
},
- "search.engine.local.local_all.use_in_auto_search": {
- "category": "local_all",
- "description": "Include local documents in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.personal_notes.use_in_auto_search": {
- "category": "personal_notes",
- "description": "Include personal notes in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.project_docs.use_in_auto_search": {
- "category": "project_docs",
- "description": "Include project documents in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.research_papers.use_in_auto_search": {
- "category": "research_papers",
- "description": "Include research papers in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
"app.warnings.dismiss_high_context": {
"category": "warnings",
"description": "Dismiss warnings about high context window sizes that may cause memory issues",
diff --git a/src/local_deep_research/research_library/routes/rag_routes.py b/src/local_deep_research/research_library/routes/rag_routes.py
index 3235541f3..d237a9632 100644
--- a/src/local_deep_research/research_library/routes/rag_routes.py
+++ b/src/local_deep_research/research_library/routes/rag_routes.py
@@ -2154,7 +2154,7 @@ def _get_rag_service_for_thread(
Create RAG service for use in background threads (no Flask context).
"""
from ...database.session_context import get_user_db_session
- from ...web_search_engines.engines.search_engine_local import (
+ from ...web_search_engines.engines.local_embedding_manager import (
LocalEmbeddingManager,
)
import json
@@ -2263,8 +2263,6 @@ def _get_rag_service_for_thread(
embedding_manager = LocalEmbeddingManager(
embedding_model=embedding_model,
embedding_model_type=embedding_provider,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
settings_snapshot=settings_snapshot,
)
embedding_manager.db_password = db_password
diff --git a/src/local_deep_research/research_library/services/library_rag_service.py b/src/local_deep_research/research_library/services/library_rag_service.py
index ca490c3c7..c8f8ab034 100644
--- a/src/local_deep_research/research_library/services/library_rag_service.py
+++ b/src/local_deep_research/research_library/services/library_rag_service.py
@@ -29,7 +29,7 @@ from ...database.models.library import (
from ...database.session_context import get_user_db_session
from ...utilities.type_utils import to_bool
from ...embeddings.splitters import get_text_splitter
-from ...web_search_engines.engines.search_engine_local import (
+from ...web_search_engines.engines.local_embedding_manager import (
LocalEmbeddingManager,
)
from ...security.file_integrity import FileIntegrityManager, FAISSIndexVerifier
@@ -126,8 +126,6 @@ class LibraryRAGService:
self.embedding_manager = LocalEmbeddingManager(
embedding_model=embedding_model,
embedding_model_type=embedding_provider,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
settings_snapshot=settings_snapshot,
)
diff --git a/src/local_deep_research/security/module_whitelist.py b/src/local_deep_research/security/module_whitelist.py
index 7bc14cd04..b15d754e4 100644
--- a/src/local_deep_research/security/module_whitelist.py
+++ b/src/local_deep_research/security/module_whitelist.py
@@ -40,8 +40,7 @@ ALLOWED_MODULE_PATHS: frozenset[str] = frozenset(
".engines.search_engine_google_pse",
".engines.search_engine_guardian",
".engines.search_engine_library",
- ".engines.search_engine_local",
- ".engines.search_engine_local_all",
+ ".engines.local_embedding_manager",
".engines.search_engine_mojeek",
".engines.search_engine_nasa_ads",
".engines.search_engine_openalex",
@@ -81,8 +80,6 @@ ALLOWED_CLASS_NAMES: frozenset[str] = frozenset(
"GooglePSESearchEngine",
"GuardianSearchEngine",
"LibraryRAGSearchEngine",
- "LocalAllSearchEngine",
- "LocalSearchEngine",
"MetaSearchEngine",
"MojeekSearchEngine",
"NasaAdsSearchEngine",
diff --git a/src/local_deep_research/web/api.py b/src/local_deep_research/web/api.py
index e4adbef4d..e9600d8f2 100644
--- a/src/local_deep_research/web/api.py
+++ b/src/local_deep_research/web/api.py
@@ -423,7 +423,7 @@ def api_analyze_documents():
POST /api/v1/analyze_documents
{
"query": "neural networks in medicine",
- "collection_name": "research_papers", # Required: local collection name
+ "collection_name": "my_collection", # Required: local collection name
"max_results": 20, # Optional: max results to return
"temperature": 0.7, # Optional: LLM temperature
"force_reindex": false # Optional: force reindexing
diff --git a/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py b/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py
new file mode 100644
index 000000000..4f14a9bee
--- /dev/null
+++ b/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py
@@ -0,0 +1,290 @@
+import hashlib
+import threading
+import uuid
+from datetime import UTC, datetime
+from typing import Any, Dict, List, Optional
+
+from langchain_community.embeddings import (
+ HuggingFaceEmbeddings,
+)
+from langchain_core.documents import Document
+from loguru import logger
+
+from ...database.models.library import DocumentChunk
+from ...database.session_context import get_user_db_session
+from ...utilities.url_utils import normalize_url
+
+
+class LocalEmbeddingManager:
+ """Handles embedding generation and storage for local document search"""
+
+ def __init__(
+ self,
+ embedding_model: str = "all-MiniLM-L6-v2",
+ embedding_device: str = "cpu",
+ embedding_model_type: str = "sentence_transformers", # or 'ollama'
+ ollama_base_url: Optional[str] = None,
+ settings_snapshot: Optional[Dict[str, Any]] = None,
+ ):
+ """
+ Record the embedding configuration; no embedding model is loaded here.
+
+ Args:
+ embedding_model: Name of the embedding model to use
+ embedding_device: Device to run embeddings on ('cpu' or 'cuda')
+ embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama')
+ ollama_base_url: Base URL for Ollama API if using ollama embeddings
+ settings_snapshot: Optional settings snapshot for background threads; its "_username" entry becomes self.username
+ """
+
+ self.embedding_model = embedding_model
+ self.embedding_device = embedding_device
+ self.embedding_model_type = embedding_model_type
+ self.ollama_base_url = ollama_base_url
+ self.settings_snapshot = settings_snapshot or {}
+
+ # Username for database access (None when the snapshot is missing or has no "_username")
+ self.username = (
+ settings_snapshot.get("_username") if settings_snapshot else None
+ )
+ # Password for encrypted database access (not a constructor argument; assigned later if needed)
+ self.db_password = None
+
+ # Model slot starts empty; the lock is created up front for later use when the model is loaded
+ self._embeddings = None
+ self._embedding_lock = threading.Lock()
+
+ # Vector store cache (starts empty)
+ self.vector_stores = {}
+
+ # Closed flag; starts False
+ self._closed = False
+
+ def close(self):
+ """Release the embedding model and cached vector stores; repeated calls are no-ops."""
+ if self._closed:
+ return
+ self._closed = True
+ # Drop the model reference so it can be garbage collected
+ self._embeddings = None
+ # Empty the vector store cache in place
+ self.vector_stores.clear()
+ logger.debug("LocalEmbeddingManager closed")
+
+ def __enter__(self):
+ """Context manager entry; returns this manager unchanged."""
+ return self
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ """Context manager exit - calls close(); returns False so exceptions propagate."""
+ self.close()
+ return False
+
+ @property
+ def embeddings(self):
+ """
+ Return the embedding model, initializing it on first access.
+ This allows the LocalEmbeddingManager to be created without
+ immediately loading models, which is helpful when no embeddings are ever needed.
+
+ Uses double-checked locking (on this instance's lock) for thread-safe initialization.
+ Concurrent SentenceTransformer model loading causes meta tensor errors
+ in PyTorch when multiple threads call model.to(device) simultaneously.
+ """
+ if self._embeddings is None:
+ with self._embedding_lock:
+ if self._embeddings is None:  # re-check: another thread may have loaded it while we waited
+ logger.info("Initializing embeddings on first use")
+ self._embeddings = self._initialize_embeddings()
+ return self._embeddings
+
+ def _initialize_embeddings(self):
+ """Build the embedding model via get_embeddings; on any error fall back to HuggingFaceEmbeddings."""
+ try:
+ # Delegate to the shared get_embeddings factory (imported at call time)
+ from ...embeddings import get_embeddings
+
+ # Provider-specific keyword arguments, filled in below
+ kwargs = {}
+
+ # The device is only forwarded for sentence transformers
+ if self.embedding_model_type == "sentence_transformers":
+ kwargs["device"] = self.embedding_device
+
+ # The base_url is only forwarded for ollama, and only when one was given
+ if self.embedding_model_type == "ollama" and self.ollama_base_url:
+ kwargs["base_url"] = normalize_url(self.ollama_base_url)
+
+ logger.info(
+ f"Initializing embeddings with provider={self.embedding_model_type}, model={self.embedding_model}"
+ )
+
+ return get_embeddings(
+ provider=self.embedding_model_type,
+ model=self.embedding_model,
+ settings_snapshot=self.settings_snapshot,
+ **kwargs,
+ )
+ except Exception:  # NOTE(review): the fallback below ignores the configured model, provider and device
+ logger.exception("Error initializing embeddings")
+ logger.warning(
+ "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2"
+ )
+ return HuggingFaceEmbeddings(
+ model_name="sentence-transformers/all-MiniLM-L6-v2"
+ )
+
+ def _store_chunks_to_db(
+ self,
+ chunks: List[Document],
+ collection_name: str,
+ source_path: Optional[str] = None,
+ source_id: Optional[int] = None,
+ source_type: str = "local_file",
+ ) -> List[str]:
+ """
+ Store document chunks in the database, reusing rows whose text hash already exists.
+
+ Args:
+ chunks: List of LangChain Document chunks
+ collection_name: Name of the collection recorded on newly created chunks
+ source_path: Path to source file (for local files)
+ source_id: ID of source document (for library documents)
+ source_type: Type of source ('local_file' or 'library')
+
+ Returns:
+ List of chunk embedding IDs (uuid4 hex strings) for FAISS mapping; [] if no username is set or on error
+ """
+ if not self.username:
+ logger.warning(
+ "No username available, cannot store chunks in database"
+ )
+ return []
+
+ chunk_ids = []
+
+ try:
+ with get_user_db_session(
+ self.username, self.db_password
+ ) as session:
+ for idx, chunk in enumerate(chunks):
+ # SHA-256 of the chunk text; used below to detect already-stored chunks
+ chunk_text = chunk.page_content
+ chunk_hash = hashlib.sha256(chunk_text.encode()).hexdigest()
+
+ # Fresh embedding ID (only used when a new row is created)
+ embedding_id = uuid.uuid4().hex
+
+ # Title comes from "filename", then "title", else "Unknown"
+ metadata = chunk.metadata or {}
+ document_title = metadata.get(
+ "filename", metadata.get("title", "Unknown")
+ )
+
+ # Word count is a whitespace split of the chunk text
+ word_count = len(chunk_text.split())
+
+ # Character positions from metadata; defaults span the chunk text itself
+ start_char = metadata.get("start_char", 0)
+ end_char = metadata.get("end_char", len(chunk_text))
+
+ # Look up an existing chunk by text hash alone (not scoped to collection or source)
+ existing_chunk = (
+ session.query(DocumentChunk)
+ .filter_by(chunk_hash=chunk_hash)
+ .first()
+ )
+
+ if existing_chunk:
+ # Only last_accessed is refreshed; the stored row keeps its other fields
+ existing_chunk.last_accessed = datetime.now(UTC)
+ chunk_ids.append(existing_chunk.embedding_id)
+ logger.debug(
+ f"Chunk already exists, reusing: {existing_chunk.embedding_id}"
+ )
+ else:
+ # Create new chunk
+ db_chunk = DocumentChunk(
+ chunk_hash=chunk_hash,
+ source_type=source_type,
+ source_id=source_id,
+ source_path=str(source_path)
+ if source_path
+ else None,
+ collection_name=collection_name,
+ chunk_text=chunk_text,
+ chunk_index=idx,
+ start_char=start_char,
+ end_char=end_char,
+ word_count=word_count,
+ embedding_id=embedding_id,
+ embedding_model=self.embedding_model,
+ embedding_model_type=self.embedding_model_type,
+ document_title=document_title,
+ document_metadata=metadata,
+ )
+ session.add(db_chunk)
+ chunk_ids.append(embedding_id)
+
+ session.commit()
+ logger.info(
+ f"Stored {len(chunk_ids)} chunks to database for collection '{collection_name}'"
+ )
+
+ except Exception:
+ logger.exception(
+ f"Error storing chunks to database for collection '{collection_name}'"
+ )
+ return []
+
+ return chunk_ids
+
+ def _delete_chunks_from_db(
+     self,
+     collection_name: str,
+     source_path: Optional[str] = None,
+     source_id: Optional[int] = None,
+ ) -> int:
+     """
+     Delete a collection's chunks, optionally narrowed to one source.
+
+     Args:
+         collection_name: Name of the collection; with no filters, all its chunks are deleted
+         source_path: Path to source file (for local files); None or empty means no path filter
+         source_id: ID of source document (for library documents); only None means no ID filter
+
+     Returns:
+         Number of chunks deleted (0 if no username is set or the delete fails)
+     """
+     if not self.username:
+         logger.warning(
+             "No username available, cannot delete chunks from database"
+         )
+         return 0
+
+     try:
+         with get_user_db_session(
+             self.username, self.db_password
+         ) as session:
+             query = session.query(DocumentChunk).filter_by(
+                 collection_name=collection_name
+             )
+
+             if source_path:
+                 query = query.filter_by(source_path=str(source_path))
+             if source_id is not None:  # 0 is a real ID; skipping it would wipe the collection
+                 query = query.filter_by(source_id=source_id)
+
+             count = query.delete()
+             session.commit()
+
+             logger.info(
+                 f"Deleted {count} chunks from database for collection '{collection_name}'"
+             )
+             return count
+
+     except Exception:
+         logger.exception(
+             f"Error deleting chunks from database for collection '{collection_name}'"
+         )
+         return 0
diff --git a/src/local_deep_research/web_search_engines/engines/search_engine_local.py b/src/local_deep_research/web_search_engines/engines/search_engine_local.py
deleted file mode 100644
index 9f3d503a2..000000000
--- a/src/local_deep_research/web_search_engines/engines/search_engine_local.py
+++ /dev/null
@@ -1,1467 +0,0 @@
-import hashlib
-import json
-import os
-import threading
-import time
-import uuid
-from concurrent.futures import ProcessPoolExecutor
-from datetime import UTC, datetime
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, Optional
-
-import numpy as np
-from faiss import IndexFlatL2
-from langchain_community.docstore.in_memory import InMemoryDocstore
-from langchain_community.document_loaders import TextLoader
-from langchain_community.embeddings import (
- HuggingFaceEmbeddings,
-)
-from langchain_community.vectorstores import FAISS
-from langchain_core.document_loaders import BaseLoader
-from langchain_core.documents import Document
-from langchain_core.language_models import BaseLLM
-from langchain_text_splitters import RecursiveCharacterTextSplitter
-from loguru import logger
-
-from ...config import search_config
-from ...config.paths import get_cache_directory
-from ...database.models.library import DocumentChunk
-from ...database.session_context import get_user_db_session
-from ...document_loaders import get_loader_for_path, is_extension_supported
-from ...utilities.url_utils import normalize_url
-from ..search_engine_base import BaseSearchEngine
-
-
-def _get_file_loader(file_path: str) -> Optional[BaseLoader]:
- """Get an appropriate document loader for a file based on its extension.
-
- Uses the centralized document_loaders registry which supports 35+ file formats.
- """
- file_path_obj = Path(file_path)
- extension = file_path_obj.suffix.lower()
-
- # Check if extension is supported by the registry
- if is_extension_supported(extension):
- loader = get_loader_for_path(file_path)
- if loader:
- return loader
-
- # Fallback to TextLoader for unknown extensions
- logger.warning(f"Unknown file extension for {file_path}, trying TextLoader")
- try:
- return TextLoader(
- str(file_path), encoding="utf-8", autodetect_encoding=True
- )
- except Exception:
- logger.exception(f"Error creating loader for {file_path}")
- return None
-
-
-def _load_document(file_path: Path) -> List[Document]:
- """
- Loads documents from a file.
-
- Args:
- file_path: The path to the document to load.
-
- Returns:
- The loaded documents, or an empty list if it failed to load.
-
- """
- # Get a loader for this file
- loader = _get_file_loader(str(file_path))
-
- if loader is None:
- # No loader for this filetype.
- return []
-
- try:
- # Load the document
- docs = loader.load()
-
- # Add source path metadata and ID.
- for doc in docs:
- doc.metadata["source"] = str(file_path)
- doc.metadata["filename"] = file_path.name
-
- except Exception:
- logger.exception(f"Error loading {file_path}")
- return []
-
- return docs
-
-
-class LocalEmbeddingManager:
- """Handles embedding generation and storage for local document search"""
-
- def __init__(
- self,
- embedding_model: str = "all-MiniLM-L6-v2",
- embedding_device: str = "cpu",
- embedding_model_type: str = "sentence_transformers", # or 'ollama'
- ollama_base_url: Optional[str] = None,
- chunk_size: int = 1000,
- chunk_overlap: int = 200,
- cache_dir: Optional[str] = None,
- settings_snapshot: Optional[Dict[str, Any]] = None,
- ):
- """
- Initialize the embedding manager for local document search.
-
- Args:
- embedding_model: Name of the embedding model to use
- embedding_device: Device to run embeddings on ('cpu' or 'cuda')
- embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama')
- ollama_base_url: Base URL for Ollama API if using ollama embeddings
- chunk_size: Size of text chunks for splitting documents
- chunk_overlap: Overlap between chunks
- cache_dir: Directory to store embedding cache and index.
- If None, uses the app's configured cache directory.
- settings_snapshot: Optional settings snapshot for background threads
- """
-
- self.embedding_model = embedding_model
- self.embedding_device = embedding_device
- self.embedding_model_type = embedding_model_type
- self.ollama_base_url = ollama_base_url
- self.chunk_size = chunk_size
- self.chunk_overlap = chunk_overlap
- # Use configured cache directory if not specified
- if cache_dir is None:
- self.cache_dir = get_cache_directory() / "local_search"
- else:
- self.cache_dir = Path(cache_dir)
- self.settings_snapshot = settings_snapshot or {}
-
- # Username for database access (extracted from settings if available)
- self.username = (
- settings_snapshot.get("_username") if settings_snapshot else None
- )
- # Password for encrypted database access (can be set later)
- self.db_password = None
-
- # Create cache directory if it doesn't exist
- self.cache_dir.mkdir(parents=True, exist_ok=True)
-
- # Initialize the embedding model (with lock for thread-safe lazy init)
- self._embeddings = None
- self._embedding_lock = threading.Lock()
-
- # Initialize the text splitter
- self.text_splitter = RecursiveCharacterTextSplitter(
- chunk_size=chunk_size, chunk_overlap=chunk_overlap
- )
-
- # Track indexed folders and their metadata
- self.indexed_folders = self._load_indexed_folders()
-
- # Vector store cache
- self.vector_stores = {}
-
- # Track if this manager has been closed
- self._closed = False
-
- def close(self):
- """Release embedding model resources."""
- if self._closed:
- return
- self._closed = True
- # Clear embedding model reference to allow garbage collection
- self._embeddings = None
- # Clear vector store cache
- self.vector_stores.clear()
- logger.debug("LocalEmbeddingManager closed")
-
- def __enter__(self):
- """Context manager entry."""
- return self
-
- def __exit__(self, exc_type, exc_val, exc_tb):
- """Context manager exit - ensures resources are released."""
- self.close()
- return False
-
- @property
- def embeddings(self):
- """
- Lazily initialize embeddings when first accessed.
- This allows the LocalEmbeddingManager to be created without
- immediately loading models, which is helpful when no local search is performed.
-
- Uses double-checked locking to ensure thread-safe initialization.
- Concurrent SentenceTransformer model loading causes meta tensor errors
- in PyTorch when multiple threads call model.to(device) simultaneously.
- """
- if self._embeddings is None:
- with self._embedding_lock:
- if self._embeddings is None:
- logger.info("Initializing embeddings on first use")
- self._embeddings = self._initialize_embeddings()
- return self._embeddings
-
- def _initialize_embeddings(self):
- """Initialize the embedding model based on configuration"""
- try:
- # Use the new unified embedding system
- from ...embeddings import get_embeddings
-
- # Prepare kwargs for provider-specific parameters
- kwargs = {}
-
- # Add device for sentence transformers
- if self.embedding_model_type == "sentence_transformers":
- kwargs["device"] = self.embedding_device
-
- # Add base_url for ollama if specified
- if self.embedding_model_type == "ollama" and self.ollama_base_url:
- kwargs["base_url"] = normalize_url(self.ollama_base_url)
-
- logger.info(
- f"Initializing embeddings with provider={self.embedding_model_type}, model={self.embedding_model}"
- )
-
- return get_embeddings(
- provider=self.embedding_model_type,
- model=self.embedding_model,
- settings_snapshot=self.settings_snapshot,
- **kwargs,
- )
- except Exception:
- logger.exception("Error initializing embeddings")
- logger.warning(
- "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2"
- )
- return HuggingFaceEmbeddings(
- model_name="sentence-transformers/all-MiniLM-L6-v2"
- )
-
- def _store_chunks_to_db(
- self,
- chunks: List[Document],
- collection_name: str,
- source_path: Optional[str] = None,
- source_id: Optional[int] = None,
- source_type: str = "local_file",
- ) -> List[str]:
- """
- Store document chunks in the database.
-
- Args:
- chunks: List of LangChain Document chunks
- collection_name: Name of the collection (e.g., 'personal_notes', 'library')
- source_path: Path to source file (for local files)
- source_id: ID of source document (for library documents)
- source_type: Type of source ('local_file' or 'library')
-
- Returns:
- List of chunk embedding IDs (UUIDs) for FAISS mapping
- """
- if not self.username:
- logger.warning(
- "No username available, cannot store chunks in database"
- )
- return []
-
- chunk_ids = []
-
- try:
- with get_user_db_session(
- self.username, self.db_password
- ) as session:
- for idx, chunk in enumerate(chunks):
- # Generate unique hash for chunk
- chunk_text = chunk.page_content
- chunk_hash = hashlib.sha256(chunk_text.encode()).hexdigest()
-
- # Generate unique embedding ID
- embedding_id = uuid.uuid4().hex
-
- # Extract metadata
- metadata = chunk.metadata or {}
- document_title = metadata.get(
- "filename", metadata.get("title", "Unknown")
- )
-
- # Calculate word count
- word_count = len(chunk_text.split())
-
- # Get character positions from metadata if available
- start_char = metadata.get("start_char", 0)
- end_char = metadata.get("end_char", len(chunk_text))
-
- # Check if chunk already exists
- existing_chunk = (
- session.query(DocumentChunk)
- .filter_by(chunk_hash=chunk_hash)
- .first()
- )
-
- if existing_chunk:
- # Update existing chunk
- existing_chunk.last_accessed = datetime.now(UTC)
- chunk_ids.append(existing_chunk.embedding_id)
- logger.debug(
- f"Chunk already exists, reusing: {existing_chunk.embedding_id}"
- )
- else:
- # Create new chunk
- db_chunk = DocumentChunk(
- chunk_hash=chunk_hash,
- source_type=source_type,
- source_id=source_id,
- source_path=str(source_path)
- if source_path
- else None,
- collection_name=collection_name,
- chunk_text=chunk_text,
- chunk_index=idx,
- start_char=start_char,
- end_char=end_char,
- word_count=word_count,
- embedding_id=embedding_id,
- embedding_model=self.embedding_model,
- embedding_model_type=self.embedding_model_type,
- document_title=document_title,
- document_metadata=metadata,
- )
- session.add(db_chunk)
- chunk_ids.append(embedding_id)
-
- session.commit()
- logger.info(
- f"Stored {len(chunk_ids)} chunks to database for collection '{collection_name}'"
- )
-
- except Exception:
- logger.exception(
- f"Error storing chunks to database for collection '{collection_name}'"
- )
- return []
-
- return chunk_ids
-
- def _load_chunks_from_db(
- self, chunk_ids: List[str], username: Optional[str] = None
- ) -> List[Dict[str, Any]]:
- """
- Load chunks from database by their embedding IDs.
-
- Args:
- chunk_ids: List of embedding IDs to load
- username: Username for database access (uses self.username if not provided)
-
- Returns:
- List of chunk dictionaries with content and metadata
- """
- username = username or self.username
- if not username:
- logger.warning(
- "No username available, cannot load chunks from database"
- )
- return []
-
- chunks = []
-
- try:
- with get_user_db_session(username) as session:
- db_chunks = (
- session.query(DocumentChunk)
- .filter(DocumentChunk.embedding_id.in_(chunk_ids))
- .all()
- )
-
- for db_chunk in db_chunks:
- # Update last accessed time
- db_chunk.last_accessed = datetime.now(UTC)
-
- chunks.append(
- {
- "id": db_chunk.embedding_id,
- "content": db_chunk.chunk_text,
- "metadata": {
- "source_type": db_chunk.source_type,
- "source_path": db_chunk.source_path,
- "source_id": db_chunk.source_id,
- "collection": db_chunk.collection_name,
- "chunk_index": db_chunk.chunk_index,
- "word_count": db_chunk.word_count,
- "title": db_chunk.document_title,
- **db_chunk.document_metadata,
- },
- }
- )
-
- session.commit() # Commit the last_accessed updates
-
- except Exception:
- logger.exception("Error loading chunks from database")
- return []
-
- return chunks
-
- def _delete_chunks_from_db(
- self,
- collection_name: str,
- source_path: Optional[str] = None,
- source_id: Optional[int] = None,
- ) -> int:
- """
- Delete chunks from database.
-
- Args:
- collection_name: Name of the collection
- source_path: Path to source file (for local files)
- source_id: ID of source document (for library documents)
-
- Returns:
- Number of chunks deleted
- """
- if not self.username:
- logger.warning(
- "No username available, cannot delete chunks from database"
- )
- return 0
-
- try:
- with get_user_db_session(
- self.username, self.db_password
- ) as session:
- query = session.query(DocumentChunk).filter_by(
- collection_name=collection_name
- )
-
- if source_path:
- query = query.filter_by(source_path=str(source_path))
- if source_id:
- query = query.filter_by(source_id=source_id)
-
- count = query.delete()
- session.commit()
-
- logger.info(
- f"Deleted {count} chunks from database for collection '{collection_name}'"
- )
- return count
-
- except Exception:
- logger.exception(
- f"Error deleting chunks from database for collection '{collection_name}'"
- )
- return 0
-
- def _load_or_create_vector_store(self):
- """Load the vector store from disk or create it if needed"""
- vector_store_path = self._get_vector_store_path()
-
- # Check if vector store exists and is up to date
- if vector_store_path.exists() and not self._check_folders_modified():
- logger.info(
- f"Loading existing vector store from {vector_store_path}"
- )
- try:
- vector_store = FAISS.load_local(
- str(vector_store_path),
- self.embeddings,
- allow_dangerous_deserialization=True,
- )
-
- # Add this code to show document count
- doc_count = len(vector_store.index_to_docstore_id)
- logger.info(f"Loaded index with {doc_count} document chunks")
-
- return vector_store
- except Exception:
- logger.exception("Error loading vector store")
- logger.info("Will create a new vector store")
-
- # Create a new vector store
- return self._create_vector_store()
-
- def _load_indexed_folders(self) -> Dict[str, Dict[str, Any]]:
- """Load metadata about indexed folders from disk"""
- index_metadata_path = self.cache_dir / "index_metadata.json"
-
- if index_metadata_path.exists():
- try:
- with open(index_metadata_path, "r") as f:
- return json.load(f)
- except Exception:
- logger.exception("Error loading index metadata")
-
- return {}
-
- def _save_indexed_folders(self):
- """Save metadata about indexed folders to disk"""
- index_metadata_path = self.cache_dir / "index_metadata.json"
-
- try:
- with open(index_metadata_path, "w") as f:
- json.dump(self.indexed_folders, f, indent=2)
- except Exception:
- logger.exception("Error saving index metadata")
-
- @staticmethod
- def get_folder_hash(folder_path: Path) -> str:
- """Generate a hash for a folder based on its path"""
- # Canonicalize the path so we don't have weird Windows vs. Linux
- # problems or issues with trailing slashes.
- canonical_folder_path = "/".join(folder_path.parts)
- return hashlib.md5( # DevSkim: ignore DS126858
- canonical_folder_path.encode(), usedforsecurity=False
- ).hexdigest()
-
- def _get_index_path(self, folder_path: Path) -> Path:
- """Get the path where the index for a specific folder should be stored"""
- folder_hash = self.get_folder_hash(folder_path)
- return self.cache_dir / f"index_{folder_hash}"
-
- def _check_folder_modified(self, folder_path: Path) -> bool:
- """Check if a folder has been modified since it was last indexed"""
-
- @staticmethod
- def _get_all_files(folder_path: Path) -> Iterable[Path]:
- """
- Gets all the files, recursively, in a folder.
-
- Args:
- folder_path: The path to the folder.
-
- Yields:
- Each of the files in the folder.
-
- """
- for root, _, files in os.walk(folder_path):
- for file in files:
- yield Path(root) / file
-
- def _get_modified_files(self, folder_path: Path) -> List[Path]:
- """
- Gets the files in a folder that have been modified since it was last
- indexed.
-
- Args:
- folder_path: The path to the folder to check.
-
- Returns:
- A list of the files that were modified.
-
- """
- if not folder_path.exists() or not folder_path.is_dir():
- return []
-
- folder_hash = self.get_folder_hash(folder_path)
-
- if folder_hash not in self.indexed_folders:
- # If folder has never been indexed, everything has been modified.
- last_indexed = 0
- indexed_files = set()
- else:
- last_indexed = self.indexed_folders[folder_hash].get(
- "last_indexed", 0
- )
- indexed_files = (
- self.indexed_folders[folder_hash]
- .get("indexed_files", {})
- .keys()
- )
-
- # Check if any file in the folder has been modified since last indexing
- modified_files = []
- for file_path in self._get_all_files(folder_path):
- file_stats = file_path.stat()
- if file_stats.st_mtime > last_indexed:
- modified_files.append(file_path)
- elif str(file_path.relative_to(folder_path)) not in indexed_files:
- # This file somehow never got indexed.
- modified_files.append(file_path)
-
- return modified_files
-
- def _check_config_changed(self, folder_path: Path) -> bool:
- """
- Checks if the embedding configuration for a folder has been changed
- since it was last indexed.
- """
- folder_hash = self.get_folder_hash(folder_path)
-
- if folder_hash not in self.indexed_folders:
- # It hasn't been indexed at all. That's a new configuration,
- # technically.
- return True
-
- embedding_config = self.indexed_folders[folder_hash]
- chunk_size = int(embedding_config.get("chunk_size", 0))
- chunk_overlap = int(embedding_config.get("chunk_overlap", 0))
- embedding_model = embedding_config.get("embedding_model", "")
-
- if (chunk_size, chunk_overlap, embedding_model) != (
- self.chunk_size,
- self.chunk_overlap,
- self.embedding_model,
- ):
- logger.info(
- "Embedding configuration has changed, re-indexing folder."
- )
- return True
- return False
-
- def index_folder(
- self, folder_path: str, force_reindex: bool = False
- ) -> bool:
- """
- Index all documents in a folder for vector search.
-
- Args:
- folder_path: Path to the folder to index
- force_reindex: Whether to force reindexing even if unchanged
-
- Returns:
- bool: True if indexing was successful, False otherwise
- """
- folder_path = Path(folder_path)
-
- # Validate folder
- if not folder_path.exists():
- logger.error(f"Folder not found: {folder_path}")
- return False
-
- if not folder_path.is_dir():
- logger.error(f"Path is not a directory: {folder_path}")
- return False
-
- folder_str = str(folder_path)
- folder_hash = self.get_folder_hash(folder_path)
- index_path = self._get_index_path(folder_path)
-
- if force_reindex or self._check_config_changed(folder_path):
- logger.info(f"Re-indexing entire folder: {folder_path}")
- modified_files = list(self._get_all_files(folder_path))
- else:
- # Just re-index the modified files if we can get away with it.
- modified_files = self._get_modified_files(folder_path)
- logger.info(f"Re-indexing {len(modified_files)} modified files...")
-
- # Load the vector store from disk if not already loaded
- if folder_hash not in self.vector_stores and index_path.exists():
- try:
- self.vector_stores[folder_hash] = FAISS.load_local(
- str(index_path),
- self.embeddings,
- allow_dangerous_deserialization=True,
- )
- logger.info(f"Loaded index for {folder_path} from disk")
- except Exception:
- logger.exception(f"Error loading index for {folder_path}")
- # If loading fails, force reindexing
- force_reindex = True
-
- logger.info(f"Indexing folder: {folder_path}")
- start_time = time.time()
-
- # Find documents to index
- all_docs = []
-
- # Remove hidden files and directories.
- modified_files = [
- p
- for p in modified_files
- if not p.name.startswith(".")
- and not any(part.startswith(".") for part in p.parts)
- ]
- # Index them.
- with ProcessPoolExecutor() as executor:
- all_docs_nested = executor.map(_load_document, modified_files)
- # Flatten the result.
- for docs in all_docs_nested:
- all_docs.extend(docs)
-
- if force_reindex or folder_hash not in self.vector_stores:
- logger.info(f"Creating new index for {folder_path}")
- # Embed a test query to figure out embedding length.
- test_embedding = self.embeddings.embed_query("hello world")
- index = IndexFlatL2(len(test_embedding))
- # Use minimal docstore - chunks are stored in database
- self.vector_stores[folder_hash] = FAISS(
- self.embeddings,
- index=index,
- docstore=InMemoryDocstore(), # Minimal - just for FAISS compatibility
- index_to_docstore_id={},
- normalize_L2=True,
- )
-
- # Split documents into chunks
- logger.info(f"Splitting {len(all_docs)} documents into chunks")
- splits = self.text_splitter.split_documents(all_docs)
- logger.info(
- f"Created {len(splits)} chunks from {len(modified_files)} files"
- )
-
- # Store chunks in database and get embedding IDs
- embedding_ids = []
- if splits:
- logger.info(f"Storing {len(splits)} chunks in database")
- # Get collection name from folder path (last folder name)
- collection_name = folder_path.name
-
- # Store chunks to database
- embedding_ids = self._store_chunks_to_db(
- chunks=splits,
- collection_name=collection_name,
- source_type="local_file",
- )
-
- logger.info(f"Adding {len(splits)} chunks to FAISS index")
- # Add embeddings to FAISS using the database-generated IDs
- self.vector_stores[folder_hash].add_documents(
- splits, ids=embedding_ids
- )
-
- # Update indexing time for individual files.
- index_time = time.time()
- indexed_files = {}
- if folder_hash in self.indexed_folders:
- indexed_files = (
- self.indexed_folders[folder_hash]
- .get("indexed_files", {})
- .copy()
- )
- for embedding_id, split in zip(embedding_ids, splits, strict=False):
- split_source = str(
- Path(split.metadata["source"]).relative_to(folder_path)
- )
- id_list = indexed_files.setdefault(split_source, [])
- id_list.append(embedding_id)
-
- # Check for any files that were removed and remove them from the
- # vector store and database.
- delete_ids = []
- delete_paths = []
- for relative_path, chunk_ids in indexed_files.items():
- if not (folder_path / Path(relative_path)).exists():
- delete_ids.extend(chunk_ids)
- delete_paths.append(relative_path)
- if delete_ids:
- logger.info(
- f"Deleting {len(delete_paths)} non-existent files from the "
- f"index and database."
- )
- # Delete from FAISS index
- self.vector_stores[folder_hash].delete(delete_ids)
-
- # Delete from database
- collection_name = folder_path.name
- for delete_path in delete_paths:
- full_path = str(folder_path / delete_path)
- deleted_count = self._delete_chunks_from_db(
- collection_name=collection_name,
- source_path=full_path,
- )
- logger.debug(
- f"Deleted {deleted_count} chunks for {delete_path} from database"
- )
- for path in delete_paths:
- del indexed_files[path]
-
- # Save the vector store to disk
- logger.info(f"Saving index to {index_path}")
- self.vector_stores[folder_hash].save_local(str(index_path))
-
- # Update metadata
- self.indexed_folders[folder_hash] = {
- "path": folder_str,
- "last_indexed": index_time,
- "file_count": len(modified_files),
- "chunk_count": len(splits),
- "embedding_model": self.embedding_model,
- "chunk_size": self.chunk_size,
- "chunk_overlap": self.chunk_overlap,
- "indexed_files": indexed_files,
- }
-
- # Save updated metadata
- self._save_indexed_folders()
-
- elapsed_time = time.time() - start_time
- logger.info(
- f"Indexed {len(modified_files)} files in {elapsed_time:.2f} seconds"
- )
-
- return True
-
- def search(
- self,
- query: str,
- folder_paths: List[str],
- limit: int = 10,
- score_threshold: float = 0.0,
- ) -> List[Dict[str, Any]]:
- """
- Search for documents relevant to a query across specified folders.
-
- Args:
- query: The search query
- folder_paths: List of folder paths to search in
- limit: Maximum number of results to return
- score_threshold: Minimum similarity score threshold
-
- Returns:
- List of results with document content and metadata
- """
- folder_paths = [Path(p) for p in folder_paths]
-
- # Add detailed debugging for each folder
- for folder_path in folder_paths:
- folder_hash = self.get_folder_hash(folder_path)
- index_path = self._get_index_path(folder_path)
-
- logger.info(f"Diagnostic for {folder_path}:")
- logger.info(f" - Folder hash: {folder_hash}")
- logger.info(f" - Index path: {index_path}")
- logger.info(f" - Index exists on disk: {index_path.exists()}")
- logger.info(
- f" - Is in indexed_folders: {folder_hash in self.indexed_folders}"
- )
-
- if folder_hash in self.indexed_folders:
- meta = self.indexed_folders[folder_hash]
- logger.info(
- f" - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}"
- )
-
- # Validate folders exist
- valid_folder_paths = []
- for path in folder_paths:
- if path.exists() and path.is_dir():
- valid_folder_paths.append(path)
- else:
- logger.warning(
- f"Skipping non-existent folder in search: {path}"
- )
-
- # If no valid folders, return empty results
- if not valid_folder_paths:
- logger.warning(f"No valid folders to search among: {folder_paths}")
- return []
-
- all_results = []
-
- for folder_path in valid_folder_paths:
- folder_hash = self.get_folder_hash(folder_path)
-
- # Skip folders that haven't been indexed
- if folder_hash not in self.indexed_folders:
- logger.warning(f"Folder {folder_path} has not been indexed")
- continue
-
- # Make sure the vector store is loaded
- if folder_hash not in self.vector_stores:
- index_path = self._get_index_path(folder_path)
- try:
- self.vector_stores[folder_hash] = FAISS.load_local(
- str(index_path),
- self.embeddings,
- allow_dangerous_deserialization=True,
- )
- except Exception:
- logger.exception(f"Error loading index for {folder_path}")
- continue
-
- # Search in this folder
- vector_store = self.vector_stores[folder_hash]
-
- try:
- # Get query embedding
- query_vector = self.embeddings.embed_query(query)
-
- # Search FAISS index for similar vectors
- # Returns: (distances, indices) where indices are FAISS internal indices
- distances, indices = vector_store.index.search(
- np.array([query_vector], dtype=np.float32), limit
- )
-
- # Convert distances to similarity scores (L2 distance -> similarity)
- # For L2: smaller distance = more similar
- # Convert to similarity: 1 / (1 + distance)
- similarities = 1 / (1 + distances[0])
-
- # Get embedding IDs from FAISS mapping
- embedding_ids = []
- valid_indices = []
- for idx, faiss_idx in enumerate(indices[0]):
- if faiss_idx == -1: # FAISS returns -1 for empty results
- continue
- if faiss_idx in vector_store.index_to_docstore_id:
- embedding_id = vector_store.index_to_docstore_id[
- faiss_idx
- ]
- embedding_ids.append(embedding_id)
- valid_indices.append(idx)
-
- # Load chunks from database
- if embedding_ids:
- db_chunks = self._load_chunks_from_db(
- embedding_ids, self.username
- )
-
- # Create results from database chunks
- for idx, chunk in zip(valid_indices, db_chunks):
- similarity = float(similarities[idx])
-
- # Skip results below the threshold
- if similarity < score_threshold:
- continue
-
- # Extract metadata from chunk
- metadata = chunk.get("document_metadata", {})
- if "source" not in metadata and chunk.get(
- "source_path"
- ):
- metadata["source"] = chunk["source_path"]
-
- result = {
- "content": chunk["chunk_text"],
- "metadata": metadata,
- "similarity": similarity,
- "folder": folder_path,
- }
-
- all_results.append(result)
- except Exception:
- logger.exception(f"Error searching in {folder_path}")
-
- # Sort by similarity (highest first)
- all_results.sort(key=lambda x: x["similarity"], reverse=True)
-
- # Limit to the requested number
- return all_results[:limit]
-
- def clear_cache(self):
- """Clear all cached vector stores from memory (not disk)"""
- self.vector_stores.clear()
-
- def get_indexed_folders_info(self) -> List[Dict[str, Any]]:
- """Get information about all indexed folders"""
- info = []
-
- for folder_hash, metadata in self.indexed_folders.items():
- folder_info = metadata.copy()
-
- # Add formatted last indexed time
- if "last_indexed" in folder_info:
- folder_info["last_indexed_formatted"] = datetime.fromtimestamp(
- folder_info["last_indexed"]
- ).strftime("%Y-%m-%d %H:%M:%S")
-
- # Check if index file exists
- index_path = self._get_index_path(Path(folder_info["path"]))
- folder_info["index_exists"] = index_path.exists()
-
- info.append(folder_info)
-
- return info
-
-
-class LocalSearchEngine(BaseSearchEngine):
- """Local document search engine with two-phase retrieval"""
-
- def __init__(
- self,
- paths: List[str],
- llm: Optional[BaseLLM] = None,
- max_results: int = 10,
- max_filtered_results: Optional[int] = None,
- embedding_model: str = "all-MiniLM-L6-v2",
- embedding_device: str = "cpu",
- embedding_model_type: str = "sentence_transformers",
- ollama_base_url: Optional[str] = None,
- force_reindex: bool = False,
- chunk_size: int = 1000,
- chunk_overlap: int = 200,
- cache_dir: Optional[str] = None,
- collections: Optional[Dict[str, Dict[str, Any]]] = None,
- name: str = "",
- description: str = "",
- ):
- """
- Initialize the local search engine.
-
- Args:
- paths: List of folder paths to search in
- llm: Language model for relevance filtering
- max_results: Maximum number of results to return
- max_filtered_results: Maximum results after filtering
- embedding_model: Name of the embedding model to use
- embedding_device: Device to run embeddings on ('cpu' or 'cuda')
- embedding_model_type: Type of embedding model
- ollama_base_url: Base URL for Ollama API
- force_reindex: Whether to force reindexing
- chunk_size: Size of text chunks for splitting documents
- chunk_overlap: Overlap between chunks
- cache_dir: Directory to store embedding cache and index
- collections: Dictionary of named collections with paths and descriptions
- name: Human-readable name of the collection we are searching.
- description: Human-readable description of the collection we are
- searching.
- """
- # Initialize the base search engine
- super().__init__(llm=llm, max_filtered_results=max_filtered_results)
-
- self.name = name
- self.description = description
-
- # Validate folder paths
- self.folder_paths = paths
- self.valid_folder_paths = []
- for path_str in paths:
- path = Path(path_str)
- if path.exists() and path.is_dir():
- self.valid_folder_paths.append(path_str)
- else:
- logger.warning(
- f"Folder not found or is not a directory: {path_str}"
- )
-
- # If no valid folders, log a clear message
- if not self.valid_folder_paths and paths:
- logger.warning(f"No valid folders found among: {paths}")
- logger.warning(
- "This search engine will return no results until valid folders are configured"
- )
-
- self.max_results = max_results
- self.collections = collections or {
- "default": {"paths": paths, "description": "Default collection"}
- }
-
- # Initialize the embedding manager with only valid folders
- self.embedding_manager = LocalEmbeddingManager(
- embedding_model=embedding_model,
- embedding_device=embedding_device,
- embedding_model_type=embedding_model_type,
- ollama_base_url=ollama_base_url,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
- cache_dir=cache_dir,
- settings_snapshot=self.settings_snapshot,
- )
-
- # Index all folders
- self._index_folders(force_reindex)
-
- def _index_folders(self, force_reindex: bool = False):
- """Index all valid configured folders"""
- indexed = []
- failed = []
- skipped = []
-
- # Keep track of invalid folders
- for folder in self.folder_paths:
- if folder not in self.valid_folder_paths:
- skipped.append(folder)
- continue
-
- success = self.embedding_manager.index_folder(folder, force_reindex)
- if success:
- indexed.append(folder)
- else:
- failed.append(folder)
-
- if indexed:
- logger.info(
- f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}"
- )
-
- if failed:
- logger.warning(
- f"Failed to index {len(failed)} folders: {', '.join(failed)}"
- )
-
- if skipped:
- logger.warning(
- f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}"
- )
-
- def _get_previews(
- self, query: str, collection_names: Optional[List[str]] = None
- ) -> List[Dict[str, Any]]:
- """
- Get preview information for documents matching the query.
-
- Args:
- query: The search query
- collection_names: Specific collections to search within (if None, search all)
-
- Returns:
- List of preview dictionaries
- """
- # Determine which collections to search
- if collection_names:
- # Search only in specified collections
- collections_to_search = {
- name: self.collections[name]
- for name in collection_names
- if name in self.collections
- }
- if not collections_to_search:
- logger.warning(
- f"No valid collections found among: {collection_names}"
- )
- return []
- else:
- # Search in all collections
- collections_to_search = self.collections
-
- # Extract all folder paths from the collections to search
- search_paths = []
- for collection_config in collections_to_search.values():
- if "paths" in collection_config:
- search_paths.extend(collection_config["paths"])
-
- logger.info(
- f"Searching local documents in collections: {list(collections_to_search.keys())}"
- )
-
- # Filter out invalid paths
- valid_search_paths = [
- path for path in search_paths if path in self.valid_folder_paths
- ]
-
- if not valid_search_paths:
- logger.warning(
- f"No valid folders to search in collections: {list(collections_to_search.keys())}"
- )
- return []
-
- # Search across the valid selected folders
- raw_results = self.embedding_manager.search(
- query=query,
- folder_paths=valid_search_paths,
- limit=self.max_results,
- score_threshold=0.1, # Skip very low relevance results
- )
-
- if not raw_results:
- logger.info(f"No local documents found for query: {query}")
- return []
-
- # Convert to preview format
- previews = []
- for i, result in enumerate(raw_results):
- # Create a unique ID
- result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode(), usedforsecurity=False).hexdigest()}" # DevSkim: ignore DS126858
-
- # Extract filename and path
- source_path = result["metadata"].get("source", "Unknown")
- filename = result["metadata"].get(
- "filename", Path(source_path).name
- )
-
- # Create preview snippet (first ~200 chars of content)
- snippet = (
- result["content"][:200] + "..."
- if len(result["content"]) > 200
- else result["content"]
- )
-
- # Determine which collection this document belongs to
- collection_name = "Unknown"
- folder_path = result["folder"]
- for name, collection in self.collections.items():
- if any(
- folder_path.is_relative_to(path)
- for path in collection.get("paths", [])
- ):
- break
-
- # Format the preview
- preview = {
- "id": result_id,
- "title": filename,
- "snippet": snippet,
- "link": source_path,
- "similarity": result["similarity"],
- "folder": folder_path.as_posix(),
- "collection": collection_name,
- "collection_description": self.collections.get(
- collection_name, {}
- ).get("description", ""),
- "_full_content": result[
- "content"
- ], # Store full content for later
- "_metadata": result["metadata"], # Store metadata for later
- }
-
- previews.append(preview)
-
- logger.info(f"Found {len(previews)} local document matches")
- return previews
-
- def _get_full_content(
- self, relevant_items: List[Dict[str, Any]]
- ) -> List[Dict[str, Any]]:
- """
- Get full content for the relevant documents.
- For local search, the full content is already available.
-
- Args:
- relevant_items: List of relevant preview dictionaries
-
- Returns:
- List of result dictionaries with full content
- """
- # Check if we should add full content
- if (
- hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
- and search_config.SEARCH_SNIPPETS_ONLY
- ):
- logger.info("Snippet-only mode, skipping full content addition")
- return relevant_items
-
- # For local search, we already have the full content
- results = []
- for item in relevant_items:
- # Create a copy with full content
- result = item.copy()
-
- # Add full content if we have it
- if "_full_content" in item:
- result["content"] = item["_full_content"]
- result["full_content"] = item["_full_content"]
-
- # Remove temporary fields
- if "_full_content" in result:
- del result["_full_content"]
-
- # Add metadata if we have it
- if "_metadata" in item:
- result["document_metadata"] = item["_metadata"]
-
- # Remove temporary fields
- if "_metadata" in result:
- del result["_metadata"]
-
- results.append(result)
-
- return results
-
- def run(
- self,
- query: str,
- research_context: Dict[str, Any] | None = None,
- collection_names: Optional[List[str]] = None,
- ) -> List[Dict[str, Any]]:
- """
- Execute a search using the two-phase approach.
-
- Args:
- query: The search query
- research_context: Context from previous research to use.
- collection_names: Specific collections to search within (if None, search all)
-
- Returns:
- List of search result dictionaries with full content
- """
- logger.info("---Execute a search using Local Documents---")
-
- # Check if we have any special collection parameters in the query
- collection_prefix = "collection:"
- remaining_query = query
- specified_collections = []
-
- # Parse query for collection specifications like "collection:research_papers query terms"
- query_parts = query.split()
- for part in query_parts:
- if part.lower().startswith(collection_prefix):
- collection_name = part[len(collection_prefix) :].strip()
- if collection_name in self.collections:
- specified_collections.append(collection_name)
- # Remove this part from the query
- remaining_query = remaining_query.replace(
- part, "", 1
- ).strip()
-
- # If collections were specified in the query, they override the parameter
- if specified_collections:
- collection_names = specified_collections
- query = remaining_query
-
- # Phase 1: Get previews (with collection filtering)
- previews = self._get_previews(query, collection_names)
-
- if not previews:
- return []
-
- # Phase 2: Filter for relevance
- relevant_items = self._filter_for_relevance(previews, query)
-
- if not relevant_items:
- return []
-
- # Phase 3: Get full content for relevant items
- if (
- hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
- and search_config.SEARCH_SNIPPETS_ONLY
- ):
- logger.info("Returning snippet-only results as per config")
- results = relevant_items
- else:
- results = self._get_full_content(relevant_items)
-
- # Clean up temporary data
- self.embedding_manager.clear_cache()
-
- return results
-
- def get_collections_info(self) -> List[Dict[str, Any]]:
- """
- Get information about all collections, including indexing status.
-
- Returns:
- List of collection information dictionaries
- """
- collections_info = []
-
- for name, collection in self.collections.items():
- paths = collection.get("paths", [])
- paths = [Path(p) for p in paths]
- description = collection.get("description", "")
-
- # Get indexing information for each path
- paths_info = []
- for path in paths:
- # Check if folder exists
- exists = path.exists() and path.is_dir()
-
- # Check if folder is indexed
- folder_hash = self.embedding_manager.get_folder_hash(path)
- indexed = folder_hash in self.embedding_manager.indexed_folders
-
- # Get index details if available
- index_info = {}
- if indexed:
- index_info = self.embedding_manager.indexed_folders[
- folder_hash
- ].copy()
-
- paths_info.append(
- {
- "path": path,
- "exists": exists,
- "indexed": indexed,
- "index_info": index_info,
- }
- )
-
- collections_info.append(
- {
- "name": name,
- "description": description,
- "paths": paths,
- "paths_info": paths_info,
- "document_count": sum(
- info.get("index_info", {}).get("file_count", 0)
- for info in paths_info
- ),
- "chunk_count": sum(
- info.get("index_info", {}).get("chunk_count", 0)
- for info in paths_info
- ),
- "all_indexed": all(
- info["indexed"] for info in paths_info if info["exists"]
- ),
- }
- )
-
- return collections_info
-
- def reindex_collection(self, collection_name: str) -> bool:
- """
- Reindex a specific collection.
-
- Args:
- collection_name: Name of the collection to reindex
-
- Returns:
- True if reindexing was successful, False otherwise
- """
- if collection_name not in self.collections:
- logger.error(f"Collection '{collection_name}' not found")
- return False
-
- paths = self.collections[collection_name].get("paths", [])
- success = True
-
- for path in paths:
- if not self.embedding_manager.index_folder(
- path, force_reindex=True
- ):
- success = False
-
- return success
-
- @classmethod
- def from_config(
- cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None
- ) -> "LocalSearchEngine":
- """
- Create a LocalSearchEngine instance from a configuration dictionary.
-
- Args:
- config_dict: Configuration dictionary
- llm: Language model for relevance filtering
-
- Returns:
- Initialized LocalSearchEngine instance
- """
- # Required parameters
- folder_paths = []
- collections = config_dict.get("collections", {})
-
- # Extract all folder paths from collections
- for collection_config in collections.values():
- if "paths" in collection_config:
- folder_paths.extend(collection_config["paths"])
-
- # Fall back to folder_paths if no collections defined
- if not folder_paths:
- folder_paths = config_dict.get("folder_paths", [])
- # Create a default collection if using folder_paths
- if folder_paths:
- collections = {
- "default": {
- "paths": folder_paths,
- "description": "Default collection",
- }
- }
-
- # Optional parameters with defaults
- max_results = config_dict.get("max_results", 10)
- max_filtered_results = config_dict.get("max_filtered_results")
- embedding_model = config_dict.get("embedding_model", "all-MiniLM-L6-v2")
- embedding_device = config_dict.get("embedding_device", "cpu")
- embedding_model_type = config_dict.get(
- "embedding_model_type", "sentence_transformers"
- )
- ollama_base_url = config_dict.get("ollama_base_url")
- force_reindex = config_dict.get("force_reindex", False)
- chunk_size = config_dict.get("chunk_size", 1000)
- chunk_overlap = config_dict.get("chunk_overlap", 200)
- cache_dir = config_dict.get(
- "cache_dir"
- ) # None uses app's cache directory
-
- return cls(
- paths=folder_paths,
- collections=collections,
- llm=llm,
- max_results=max_results,
- max_filtered_results=max_filtered_results,
- embedding_model=embedding_model,
- embedding_device=embedding_device,
- embedding_model_type=embedding_model_type,
- ollama_base_url=ollama_base_url,
- force_reindex=force_reindex,
- chunk_size=chunk_size,
- chunk_overlap=chunk_overlap,
- cache_dir=cache_dir,
- )
diff --git a/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py b/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py
deleted file mode 100644
index adec1d59d..000000000
--- a/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py
+++ /dev/null
@@ -1,170 +0,0 @@
-"""
-Search engine that searches across all local collections
-"""
-
-from typing import Any, Dict, List, Optional, cast
-
-from langchain_core.language_models import BaseLLM
-from loguru import logger
-
-from ..search_engine_base import BaseSearchEngine
-from ..search_engine_factory import create_search_engine
-from ..search_engines_config import local_search_engines
-from .search_engine_local import LocalSearchEngine
-
-
-class LocalAllSearchEngine(BaseSearchEngine):
- """
- Search engine that searches across all local document collections.
- Acts as a meta search engine specifically for local collections.
- """
-
- def __init__(
- self,
- llm: Optional[BaseLLM] = None,
- max_results: int = 10,
- max_filtered_results: Optional[int] = None,
- settings_snapshot: Optional[Dict[str, Any]] = None,
- programmatic_mode: bool = False,
- **kwargs,
- ):
- """
- Initialize the local all-collections search engine.
-
- Args:
- llm: Language model for relevance filtering
- max_results: Maximum number of search results
- max_filtered_results: Maximum results after filtering
- settings_snapshot: Settings snapshot for thread context
- programmatic_mode: If True, disables database operations and metrics tracking
- **kwargs: Additional parameters passed to LocalSearchEngine instances
- """
- # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
- super().__init__(
- llm=llm,
- max_filtered_results=max_filtered_results,
- max_results=max_results,
- settings_snapshot=settings_snapshot,
- programmatic_mode=programmatic_mode,
- )
-
- # Find all local collection search engines
- self.local_engines = {}
- try:
- for collection_id in local_search_engines():
- # Create a search engine for this collection
- try:
- engine = create_search_engine(
- collection_id,
- llm=llm,
- max_filtered_results=max_filtered_results,
- settings_snapshot=settings_snapshot,
- programmatic_mode=programmatic_mode,
- )
- engine = cast(LocalSearchEngine, engine)
-
- if engine:
- self.local_engines[collection_id] = {
- "engine": engine,
- "name": engine.name,
- "description": engine.description,
- }
- except Exception:
- logger.exception(
- f"Error creating search engine for collection '{collection_id}'"
- )
- except ImportError:
- logger.warning("No local collections configuration found")
-
- def _get_previews(self, query: str) -> List[Dict[str, Any]]:
- """
- Get preview information for documents from all local collections.
-
- Args:
- query: The search query
-
- Returns:
- List of preview dictionaries
- """
- logger.info(
- f"Searching across all local collections for query: {query}"
- )
-
- all_previews = []
-
- # Get previews from each local search engine
- for collection_id, engine_info in self.local_engines.items():
- engine = engine_info["engine"]
- try:
- # Get previews from this engine
- previews = engine._get_previews(query)
-
- # Add collection info to each preview
- for preview in previews:
- preview["collection_id"] = collection_id
- preview["collection_name"] = engine_info["name"]
- preview["collection_description"] = engine_info[
- "description"
- ]
-
- all_previews.extend(previews)
- except Exception:
- logger.exception(
- f"Error searching collection '{collection_id}'"
- )
-
- if not all_previews:
- logger.info(f"No local documents found for query: {query}")
- return []
-
- # Sort by similarity score if available
- all_previews.sort(
- key=lambda x: float(x.get("similarity", 0)), reverse=True
- )
-
- # Limit to max_results
- return all_previews[: self.max_results]
-
- def _get_full_content(
- self, relevant_items: List[Dict[str, Any]]
- ) -> List[Dict[str, Any]]:
- """
- Get full content for the relevant documents.
- Delegates to the appropriate collection's search engine.
-
- Args:
- relevant_items: List of relevant preview dictionaries
-
- Returns:
- List of result dictionaries with full content
- """
- # Group items by collection
- items_by_collection = {}
- for item in relevant_items:
- collection_id = item.get("collection_id")
- if collection_id and collection_id in self.local_engines:
- if collection_id not in items_by_collection:
- items_by_collection[collection_id] = []
- items_by_collection[collection_id].append(item)
-
- # Process each collection's items with its own engine
- all_results = []
- for collection_id, items in items_by_collection.items():
- engine = self.local_engines[collection_id]["engine"]
- try:
- results = engine._get_full_content(items)
- all_results.extend(results)
- except Exception:
- logger.exception(
- f"Error getting full content from collection '{collection_id}'"
- )
- # Fall back to returning the items without full content
- all_results.extend(items)
-
- # Add any items that weren't processed
- processed_ids = set(item["id"] for item in all_results)
- for item in relevant_items:
- if item["id"] not in processed_ids:
- all_results.append(item)
-
- return all_results
diff --git a/src/local_deep_research/web_search_engines/rate_limiting/tracker.py b/src/local_deep_research/web_search_engines/rate_limiting/tracker.py
index 1d860e058..c83a417fa 100644
--- a/src/local_deep_research/web_search_engines/rate_limiting/tracker.py
+++ b/src/local_deep_research/web_search_engines/rate_limiting/tracker.py
@@ -261,7 +261,6 @@ class AdaptiveRateLimitTracker:
# First time seeing this engine - start optimistic and learn from real responses
# Use engine-specific optimistic defaults only for what we know for sure
optimistic_defaults = {
- "LocalSearchEngine": 0.0, # No network calls
"SearXNGSearchEngine": 0.1, # Self-hosted default engine
}
diff --git a/src/local_deep_research/web_search_engines/search_engines_config.py b/src/local_deep_research/web_search_engines/search_engines_config.py
index 560f3070a..4234495f8 100644
--- a/src/local_deep_research/web_search_engines/search_engines_config.py
+++ b/src/local_deep_research/web_search_engines/search_engines_config.py
@@ -3,8 +3,7 @@ Configuration file for search engines.
Loads search engine definitions from the user's configuration.
"""
-import json
-from typing import Any, Dict, List, Optional
+from typing import Any, Dict, Optional
from sqlalchemy.orm import Session
from loguru import logger
@@ -164,66 +163,6 @@ def search_config(
if "auto" in search_engines and "meta" not in search_engines:
search_engines["meta"] = search_engines["auto"]
- # Register local document collections
- local_collections_data = (
- _get_setting(
- "search.engine.local",
- {},
- db_session=db_session,
- settings_snapshot=settings_snapshot,
- username=username,
- )
- or {}
- )
- local_collections_data = _extract_per_engine_config(local_collections_data)
-
- for collection, config in local_collections_data.items():
- if not config.get("enabled", True):
- # Search engine is not enabled. Ignore.
- logger.info(f"Ignoring disabled local collection '{collection}'.")
- continue
-
- if "paths" in config and isinstance(config["paths"], str):
- # This will be saved as a json array.
- try:
- config["paths"] = json.loads(config["paths"])
- except json.decoder.JSONDecodeError:
- logger.exception(
- f"Path for local collection '{collection}' is not a valid JSON array: "
- f"{config['paths']}"
- )
- config["paths"] = []
-
- # Create a new dictionary with required search engine fields
- engine_config = {
- "default_params": config,
- "requires_llm": True,
- }
- engine_config_prefix = f"search.engine.local.{collection}"
- engine_config["module_path"] = _get_setting(
- f"{engine_config_prefix}.module_path",
- ".engines.search_engine_local",
- db_session=db_session,
- settings_snapshot=settings_snapshot,
- username=username,
- )
- engine_config["class_name"] = _get_setting(
- f"{engine_config_prefix}.class_name",
- "LocalSearchEngine",
- db_session=db_session,
- settings_snapshot=settings_snapshot,
- username=username,
- )
-
- # Copy these specific fields to the top level if they exist
- for field in ["strengths", "weaknesses", "reliability", "description"]:
- if field in config:
- engine_config[field] = config[field]
-
- search_engines[collection] = engine_config
-
- logger.info("Registered local document collections as search engines")
-
# Register Library RAG as a search engine
library_enabled = _get_setting(
"search.engine.library.enabled",
@@ -338,45 +277,3 @@ def default_search_engine(
settings_snapshot=settings_snapshot,
username=username,
)
-
-
-def local_search_engines(
- username: Optional[str] = None,
- db_session: Optional[Session] = None,
- settings_snapshot: Optional[Dict[str, Any]] = None,
-) -> List[str]:
- """
- Returns a list of the enabled local search engines.
-
- Args:
- username: Username for backward compatibility (deprecated)
- db_session: Database session for direct access (preferred for web routes)
- settings_snapshot: Settings snapshot for thread context (preferred for background threads)
-
- Returns:
- A list of the enabled local search engines.
- """
- local_collections_data = (
- _get_setting(
- "search.engine.local",
- {},
- db_session=db_session,
- settings_snapshot=settings_snapshot,
- username=username,
- )
- or {}
- )
- local_collections_data = _extract_per_engine_config(local_collections_data)
-
- # Don't include the `local_all` collection.
- local_collections_data.pop("local_all", None)
- # Remove disabled collections.
- local_collections_data = {
- k: v
- for k, v in local_collections_data.items()
- if v.get("enabled", True)
- }
-
- enabled_collections = list(local_collections_data.keys())
- logger.debug(f"Using local collections: {enabled_collections}")
- return enabled_collections
diff --git a/tests/rate_limiting/test_rate_limiting.py b/tests/rate_limiting/test_rate_limiting.py
index 1c8bafdc6..51826f3fb 100644
--- a/tests/rate_limiting/test_rate_limiting.py
+++ b/tests/rate_limiting/test_rate_limiting.py
@@ -48,7 +48,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase):
"TestEngine_GetStats",
"TestEngine_Reset",
"SearXNGSearchEngine",
- "LocalSearchEngine",
]
for engine in test_engines:
try:
@@ -66,7 +65,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase):
"TestEngine_GetStats",
"TestEngine_Reset",
"SearXNGSearchEngine",
- "LocalSearchEngine",
]
for engine in test_engines:
try:
@@ -84,7 +82,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase):
try:
self.tracker.reset_engine("TestEngine")
self.tracker.reset_engine("SearXNGSearchEngine")
- self.tracker.reset_engine("LocalSearchEngine")
except:
pass
@@ -99,13 +96,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase):
searxng_wait = self.tracker.get_wait_time("SearXNGSearchEngine")
self.assertEqual(searxng_wait, 0.1) # Very optimistic for self-hosted
- # Test Local search (no network)
- # Clear from current estimates to force default
- if "LocalSearchEngine" in self.tracker.current_estimates:
- del self.tracker.current_estimates["LocalSearchEngine"]
- local_wait = self.tracker.get_wait_time("LocalSearchEngine")
- self.assertEqual(local_wait, 0.0) # No wait for local search
-
@pytest.mark.timeout(30)
def test_record_outcome_and_learning(self):
"""Test recording outcomes and learning from them.
diff --git a/tests/research_library/routes/test_rag_routes.py b/tests/research_library/routes/test_rag_routes.py
index caf291367..7d13f856b 100644
--- a/tests/research_library/routes/test_rag_routes.py
+++ b/tests/research_library/routes/test_rag_routes.py
@@ -2540,7 +2540,7 @@ class TestBackgroundThreadSettingsManagerUsage:
mock_rag.return_value = Mock()
with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
+ "local_deep_research.web_search_engines.engines.local_embedding_manager.LocalEmbeddingManager"
):
try:
_get_rag_service_for_thread(
diff --git a/tests/security/test_absolute_module_paths_hook.py b/tests/security/test_absolute_module_paths_hook.py
index 25c24fa10..e29735801 100644
--- a/tests/security/test_absolute_module_paths_hook.py
+++ b/tests/security/test_absolute_module_paths_hook.py
@@ -63,7 +63,7 @@ x = "local_deep_research.some_new_package.foo.bar"
def test_detects_multiple_violations(self):
"""Should detect multiple violations in one file."""
code = """
-a = "local_deep_research.web_search_engines.engines.search_engine_local"
+a = "local_deep_research.web_search_engines.engines.local_embedding_manager"
b = "local_deep_research.web_search_engines.engines.search_engine_brave"
c = "local_deep_research.llm.providers.implementations.openai_provider"
"""
diff --git a/tests/settings/golden_master_settings.json b/tests/settings/golden_master_settings.json
index f76fa5850..6705f2301 100644
--- a/tests/settings/golden_master_settings.json
+++ b/tests/settings/golden_master_settings.json
@@ -2169,906 +2169,6 @@
],
"visible": true
},
- "search.engine.local.local_all.class_name": {
- "category": "local_all",
- "description": "Internal: Python class implementing local document search. Do not modify.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Class Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "LocalAllSearchEngine",
- "visible": false
- },
- "search.engine.local.local_all.description": {
- "category": "local_all",
- "description": "Human-readable description of the search engine.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Search only local documents using RAG.",
- "visible": false
- },
- "search.engine.local.local_all.display_name": {
- "category": "local_all",
- "description": "Display name to use in the U.I. for this search engine.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Display Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Local Documents",
- "visible": false
- },
- "search.engine.local.local_all.module_path": {
- "category": "local_all",
- "description": "Internal: Python module path for local search implementation. Do not modify.",
- "editable": false,
- "max_value": null,
- "min_value": null,
- "name": "Module Path",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": ".engines.search_engine_local_all",
- "visible": false
- },
- "search.engine.local.local_all.reliability": {
- "category": "local_all",
- "description": "Reliability score (0-1) for local search. Quality depends on your document collection and indexing.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.85,
- "visible": true
- },
- "search.engine.local.local_all.requires_api_key": {
- "category": "local_all",
- "description": "Local document search does not require any external API keys.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Requires Api Key",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.local_all.requires_llm": {
- "category": "local_all",
- "description": "Indicates this engine uses the LLM to rerank and filter results for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Requires Llm",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.local_all.strengths": {
- "category": "local_all",
- "description": "Advantages: Searches all local document collections at once, works offline, uses your private documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "searches all local collections",
- "personal documents",
- "offline access"
- ],
- "visible": true
- },
- "search.engine.local.local_all.use_in_auto_search": {
- "category": "local_all",
- "description": "Include local documents in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.local_all.weaknesses": {
- "category": "local_all",
- "description": "Limitations: May return too many results from mixed collections, requires documents to be indexed first.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "may return too many results",
- "requires indexing"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.cache_dir": {
- "category": "personal_notes",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.personal_notes.chunk_overlap": {
- "category": "personal_notes",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 100,
- "visible": true
- },
- "search.engine.local.personal_notes.chunk_size": {
- "category": "personal_notes",
- "description": "Maximum characters per chunk when splitting documents for RAG indexing. Smaller = more precise, larger = more context.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 500,
- "visible": true
- },
- "search.engine.local.personal_notes.description": {
- "category": "personal_notes",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Personal notes and documents",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_device": {
- "category": "personal_notes",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_model": {
- "category": "personal_notes",
- "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.personal_notes.embedding_model_type": {
- "category": "personal_notes",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.personal_notes.enabled": {
- "category": "personal_notes",
- "description": "Enable this document collection for searching. Disable if you don't want to index these documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.personal_notes.max_filtered_results": {
- "category": "personal_notes",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 10,
- "visible": true
- },
- "search.engine.local.personal_notes.max_results": {
- "category": "personal_notes",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 30,
- "visible": true
- },
- "search.engine.local.personal_notes.name": {
- "category": "personal_notes",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Personal Notes",
- "visible": true
- },
- "search.engine.local.personal_notes.paths": {
- "category": "personal_notes",
- "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/personal_notes"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.reliability": {
- "category": "personal_notes",
- "description": "Reliability score (0-1). Personal notes are rated lower (0.75) as they may contain informal or subjective content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.75,
- "visible": true
- },
- "search.engine.local.personal_notes.strengths": {
- "category": "personal_notes",
- "description": "Advantages: Access to your personal knowledge, notes, and private documents not available elsewhere.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "personal knowledge",
- "notes",
- "private documents"
- ],
- "visible": true
- },
- "search.engine.local.personal_notes.use_in_auto_search": {
- "category": "personal_notes",
- "description": "Include personal notes in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.personal_notes.weaknesses": {
- "category": "personal_notes",
- "description": "Limitations: Content may be subjective, informal, or incomplete compared to published sources.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "subjective content",
- "informal information"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.cache_dir": {
- "category": "project_docs",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.project_docs.chunk_overlap": {
- "category": "project_docs",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 200,
- "visible": true
- },
- "search.engine.local.project_docs.chunk_size": {
- "category": "project_docs",
- "description": "Maximum characters per chunk when splitting documents for RAG indexing. Larger default (1000) suits technical documentation.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 1000,
- "visible": true
- },
- "search.engine.local.project_docs.description": {
- "category": "project_docs",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Project documentation and specifications",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_device": {
- "category": "project_docs",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_model": {
- "category": "project_docs",
- "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.project_docs.embedding_model_type": {
- "category": "project_docs",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.project_docs.enabled": {
- "category": "project_docs",
- "description": "Enable this document collection for searching. Disable if you don't want to index these documents.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.project_docs.max_filtered_results": {
- "category": "project_docs",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 5,
- "visible": true
- },
- "search.engine.local.project_docs.max_results": {
- "category": "project_docs",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 20,
- "visible": true
- },
- "search.engine.local.project_docs.name": {
- "category": "project_docs",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Project Documents",
- "visible": true
- },
- "search.engine.local.project_docs.paths": {
- "category": "project_docs",
- "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/project_docs/"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.reliability": {
- "category": "project_docs",
- "description": "Reliability score (0-1). Project docs rated moderately (0.8) as they are semi-formal technical content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.9,
- "visible": true
- },
- "search.engine.local.project_docs.strengths": {
- "category": "project_docs",
- "description": "Advantages: Access to project-specific technical docs, READMEs, and internal documentation not available online.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "project documentation",
- "specifications",
- "internal documents"
- ],
- "visible": true
- },
- "search.engine.local.project_docs.use_in_auto_search": {
- "category": "project_docs",
- "description": "Include project documents in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.project_docs.weaknesses": {
- "category": "project_docs",
- "description": "Limitations: May be outdated if docs not maintained, limited scope to specific projects.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "no external information",
- "limited to organizational knowledge"
- ],
- "visible": true
- },
- "search.engine.local.research_papers.cache_dir": {
- "category": "research_papers",
- "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Cache Dir",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": null,
- "visible": true
- },
- "search.engine.local.research_papers.chunk_overlap": {
- "category": "research_papers",
- "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.",
- "editable": true,
- "max_value": null,
- "min_value": 0,
- "name": "Chunk Overlap",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 150,
- "visible": true
- },
- "search.engine.local.research_papers.chunk_size": {
- "category": "research_papers",
- "description": "Maximum characters per chunk when splitting papers for RAG indexing. Default (800) balances context and precision.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Chunk Size",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 800,
- "visible": true
- },
- "search.engine.local.research_papers.description": {
- "category": "research_papers",
- "description": "Human-readable description of this document collection shown in the UI.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Description",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Academic research papers and articles",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_device": {
- "category": "research_papers",
- "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Device",
- "options": [
- {
- "label": "CPU",
- "value": "cpu"
- },
- {
- "label": "CUDA",
- "value": "cuda"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "cpu",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_model": {
- "category": "research_papers",
- "description": "Model for generating text embeddings. Consider 'allenai/specter' for academic papers if available.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "all-MiniLM-L6-v2",
- "visible": true
- },
- "search.engine.local.research_papers.embedding_model_type": {
- "category": "research_papers",
- "description": "Model provider to use for generating document embeddings.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Embedding Model Type",
- "options": [
- {
- "label": "SentenceTransformers",
- "value": "sentence_transformers"
- },
- {
- "label": "Ollama",
- "value": "ollama"
- }
- ],
- "step": null,
- "type": "SEARCH",
- "ui_element": "select",
- "value": "sentence_transformers",
- "visible": true
- },
- "search.engine.local.research_papers.enabled": {
- "category": "research_papers",
- "description": "Enable this document collection for searching. Disable if you don't have local research papers.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Enabled",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": true,
- "visible": true
- },
- "search.engine.local.research_papers.max_filtered_results": {
- "category": "research_papers",
- "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Max Filtered Results",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": 5,
- "visible": true
- },
- "search.engine.local.research_papers.max_results": {
- "category": "research_papers",
- "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.",
- "editable": true,
- "max_value": null,
- "min_value": 1,
- "name": "Max Results",
- "options": null,
- "step": 1,
- "type": "SEARCH",
- "ui_element": "number",
- "value": 20,
- "visible": true
- },
- "search.engine.local.research_papers.name": {
- "category": "research_papers",
- "description": "Internal identifier for this collection. Used in logs and configuration.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Name",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "text",
- "value": "Research Papers",
- "visible": true
- },
- "search.engine.local.research_papers.paths": {
- "category": "research_papers",
- "description": "File paths containing academic papers. Supports PDFs and text formats; directories are indexed recursively.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Paths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "/local_collections/research_papers/"
- ],
- "visible": true
- },
- "search.engine.local.research_papers.reliability": {
- "category": "research_papers",
- "description": "Reliability score (0-1). Research papers rated high (0.95) as they are peer-reviewed academic content.",
- "editable": true,
- "max_value": 1.0,
- "min_value": 0.0,
- "name": "Reliability",
- "options": null,
- "step": 0.05,
- "type": "SEARCH",
- "ui_element": "range",
- "value": 0.85,
- "visible": true
- },
- "search.engine.local.research_papers.strengths": {
- "category": "research_papers",
- "description": "Advantages: Access to peer-reviewed academic content, scientific papers, and scholarly research in your collection.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Strengths",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "academic research",
- "scientific papers",
- "scholarly content"
- ],
- "visible": true
- },
- "search.engine.local.research_papers.use_in_auto_search": {
- "category": "research_papers",
- "description": "Include research papers in auto search mode",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Include in Auto Search",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "checkbox",
- "value": false,
- "visible": true
- },
- "search.engine.local.research_papers.weaknesses": {
- "category": "research_papers",
- "description": "Limitations: Limited to papers in your collection, may be outdated if not regularly updated.",
- "editable": true,
- "max_value": null,
- "min_value": null,
- "name": "Weaknesses",
- "options": null,
- "step": null,
- "type": "SEARCH",
- "ui_element": "json",
- "value": [
- "potentially outdated",
- "limited to collected papers"
- ],
- "visible": true
- },
"search.engine.web.arxiv.class_name": {
"category": "arxiv",
"description": "Internal: Python class implementing the arXiv search engine.",
diff --git a/tests/web_search_engines/engines/test_local_embedding_manager.py b/tests/web_search_engines/engines/test_local_embedding_manager.py
new file mode 100644
index 000000000..0114e36c7
--- /dev/null
+++ b/tests/web_search_engines/engines/test_local_embedding_manager.py
@@ -0,0 +1,96 @@
+"""
+Tests for the LocalEmbeddingManager class.
+
+Tests cover:
+- LocalEmbeddingManager initialization and configuration
+- Embeddings lazy initialization
+"""
+
+from unittest.mock import Mock, patch
+
+
+class TestLocalEmbeddingManagerInit:
+ """Tests for LocalEmbeddingManager initialization."""
+
+ def test_init_with_defaults(self):
+ """Initialize with default values."""
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
+
+ manager = LocalEmbeddingManager()
+
+ assert manager.embedding_model == "all-MiniLM-L6-v2"
+ assert manager.embedding_device == "cpu"
+ assert manager.embedding_model_type == "sentence_transformers"
+ assert manager._embeddings is None # Lazy initialization
+
+ def test_init_with_ollama(self):
+ """Initialize with Ollama embeddings."""
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
+
+ manager = LocalEmbeddingManager(
+ embedding_model_type="ollama",
+ embedding_model="llama2",
+ ollama_base_url="http://localhost:11434",
+ )
+
+ assert manager.embedding_model_type == "ollama"
+ assert manager.embedding_model == "llama2"
+ assert manager.ollama_base_url == "http://localhost:11434"
+
+ def test_init_with_settings_snapshot(self):
+ """Initialize with settings snapshot."""
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
+
+ settings = {"_username": "testuser"}
+ manager = LocalEmbeddingManager(settings_snapshot=settings)
+
+ assert manager.username == "testuser"
+ assert manager.settings_snapshot == settings
+
+
+class TestLocalEmbeddingManagerEmbeddings:
+ """Tests for LocalEmbeddingManager embeddings property."""
+
+ def test_embeddings_lazy_initialization(self):
+ """Embeddings are lazily initialized."""
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
+
+ manager = LocalEmbeddingManager()
+
+ assert manager._embeddings is None
+
+ # Mock the embeddings initialization
+ mock_embeddings = Mock()
+ with patch.object(
+ manager, "_initialize_embeddings", return_value=mock_embeddings
+ ):
+ embeddings = manager.embeddings
+
+ assert embeddings is mock_embeddings
+ assert manager._embeddings is mock_embeddings
+
+ def test_embeddings_reuse(self):
+ """Embeddings are reused after initialization."""
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
+
+ manager = LocalEmbeddingManager()
+
+ mock_embeddings = Mock()
+ manager._embeddings = mock_embeddings
+
+ # Should return existing embeddings without reinitializing
+ with patch.object(manager, "_initialize_embeddings") as mock_init:
+ embeddings = manager.embeddings
+
+ assert embeddings is mock_embeddings
+ mock_init.assert_not_called()
diff --git a/tests/web_search_engines/engines/test_search_engine_local.py b/tests/web_search_engines/engines/test_search_engine_local.py
deleted file mode 100644
index b16ee01e7..000000000
--- a/tests/web_search_engines/engines/test_search_engine_local.py
+++ /dev/null
@@ -1,1073 +0,0 @@
-"""
-Tests for the LocalSearchEngine and LocalEmbeddingManager classes.
-
-Tests cover:
-- Helper functions (_get_file_loader, _load_document)
-- LocalEmbeddingManager initialization and methods
-- LocalSearchEngine initialization and methods
-- Search functionality
-- Folder indexing
-"""
-
-import json
-from pathlib import Path
-from unittest.mock import Mock, patch
-
-
-class TestGetFileLoader:
- """Tests for _get_file_loader helper function."""
-
- def test_get_file_loader_pdf(self, tmp_path):
- """Get file loader for PDF files."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- pdf_file = tmp_path / "test.pdf"
- pdf_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(pdf_file))
- mock_get_loader.assert_called_once_with(str(pdf_file))
- assert result is mock_loader
-
- def test_get_file_loader_txt(self, tmp_path):
- """Get file loader for text files."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- txt_file = tmp_path / "test.txt"
- txt_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(txt_file))
- mock_get_loader.assert_called_once_with(str(txt_file))
- assert result is mock_loader
-
- def test_get_file_loader_markdown(self, tmp_path):
- """Get file loader for markdown files."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- md_file = tmp_path / "test.md"
- md_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(md_file))
- mock_get_loader.assert_called_once_with(str(md_file))
- assert result is mock_loader
-
- def test_get_file_loader_docx(self, tmp_path):
- """Get file loader for Word documents."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- docx_file = tmp_path / "test.docx"
- docx_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(docx_file))
- mock_get_loader.assert_called_once_with(str(docx_file))
- assert result is mock_loader
-
- def test_get_file_loader_csv(self, tmp_path):
- """Get file loader for CSV files."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- csv_file = tmp_path / "test.csv"
- csv_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(csv_file))
- mock_get_loader.assert_called_once_with(str(csv_file))
- assert result is mock_loader
-
- def test_get_file_loader_xlsx(self, tmp_path):
- """Get file loader for Excel files."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- xlsx_file = tmp_path / "test.xlsx"
- xlsx_file.touch()
-
- mock_loader = Mock()
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=mock_loader,
- ) as mock_get_loader:
- result = _get_file_loader(str(xlsx_file))
- mock_get_loader.assert_called_once_with(str(xlsx_file))
- assert result is mock_loader
-
- def test_get_file_loader_unknown_extension(self, tmp_path):
- """Get file loader for unknown extension falls back to TextLoader."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- unknown_file = tmp_path / "test.xyz"
- unknown_file.touch()
-
- # When extension is not supported, get_loader_for_path returns None
- # and _get_file_loader falls back to TextLoader
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.is_extension_supported",
- return_value=False,
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.TextLoader"
- ) as mock_text_loader:
- _get_file_loader(str(unknown_file))
- mock_text_loader.assert_called_once_with(
- str(unknown_file),
- encoding="utf-8",
- autodetect_encoding=True,
- )
-
- def test_get_file_loader_exception(self, tmp_path):
- """Get file loader handles exceptions."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _get_file_loader,
- )
-
- pdf_file = tmp_path / "test.pdf"
- pdf_file.touch()
-
- # When get_loader_for_path raises an exception, it should return None
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.is_extension_supported",
- return_value=True,
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path",
- return_value=None,
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.TextLoader",
- side_effect=Exception("Loader error"),
- ):
- loader = _get_file_loader(str(pdf_file))
- assert loader is None
-
-
-class TestLoadDocument:
- """Tests for _load_document helper function."""
-
- def test_load_document_success(self, tmp_path):
- """Load document successfully."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _load_document,
- )
- from langchain_core.documents import Document
-
- txt_file = tmp_path / "test.txt"
- txt_file.write_text("Test content")
-
- mock_doc = Document(page_content="Test content", metadata={})
- mock_loader = Mock()
- mock_loader.load.return_value = [mock_doc]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader",
- return_value=mock_loader,
- ):
- docs = _load_document(txt_file)
-
- assert len(docs) == 1
- assert docs[0].page_content == "Test content"
- assert docs[0].metadata["source"] == str(txt_file)
- assert docs[0].metadata["filename"] == "test.txt"
-
- def test_load_document_no_loader(self, tmp_path):
- """Load document with no available loader."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _load_document,
- )
-
- file_path = tmp_path / "test.xyz"
- file_path.touch()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader",
- return_value=None,
- ):
- docs = _load_document(file_path)
- assert docs == []
-
- def test_load_document_exception(self, tmp_path):
- """Load document handles exceptions."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- _load_document,
- )
-
- txt_file = tmp_path / "test.txt"
- txt_file.touch()
-
- mock_loader = Mock()
- mock_loader.load.side_effect = Exception("Load error")
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader",
- return_value=mock_loader,
- ):
- docs = _load_document(txt_file)
- assert docs == []
-
-
-class TestLocalEmbeddingManagerInit:
- """Tests for LocalEmbeddingManager initialization."""
-
- def test_init_with_defaults(self, tmp_path):
- """Initialize with default values."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory",
- return_value=tmp_path,
- ):
- manager = LocalEmbeddingManager()
-
- assert manager.embedding_model == "all-MiniLM-L6-v2"
- assert manager.embedding_device == "cpu"
- assert manager.embedding_model_type == "sentence_transformers"
- assert manager.chunk_size == 1000
- assert manager.chunk_overlap == 200
- assert manager._embeddings is None # Lazy initialization
-
- def test_init_with_custom_cache_dir(self, tmp_path):
- """Initialize with custom cache directory."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- custom_cache = tmp_path / "custom_cache"
- manager = LocalEmbeddingManager(cache_dir=str(custom_cache))
-
- assert manager.cache_dir == custom_cache
- assert custom_cache.exists()
-
- def test_init_with_ollama(self, tmp_path):
- """Initialize with Ollama embeddings."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(
- embedding_model_type="ollama",
- embedding_model="llama2",
- ollama_base_url="http://localhost:11434",
- cache_dir=str(tmp_path),
- )
-
- assert manager.embedding_model_type == "ollama"
- assert manager.embedding_model == "llama2"
- assert manager.ollama_base_url == "http://localhost:11434"
-
- def test_init_with_settings_snapshot(self, tmp_path):
- """Initialize with settings snapshot."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- settings = {"_username": "testuser"}
- manager = LocalEmbeddingManager(
- settings_snapshot=settings, cache_dir=str(tmp_path)
- )
-
- assert manager.username == "testuser"
- assert manager.settings_snapshot == settings
-
-
-class TestLocalEmbeddingManagerEmbeddings:
- """Tests for LocalEmbeddingManager embeddings property."""
-
- def test_embeddings_lazy_initialization(self, tmp_path):
- """Embeddings are lazily initialized."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- assert manager._embeddings is None
-
- # Mock the embeddings initialization
- mock_embeddings = Mock()
- with patch.object(
- manager, "_initialize_embeddings", return_value=mock_embeddings
- ):
- embeddings = manager.embeddings
-
- assert embeddings is mock_embeddings
- assert manager._embeddings is mock_embeddings
-
- def test_embeddings_reuse(self, tmp_path):
- """Embeddings are reused after initialization."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- mock_embeddings = Mock()
- manager._embeddings = mock_embeddings
-
- # Should return existing embeddings without reinitializing
- with patch.object(manager, "_initialize_embeddings") as mock_init:
- embeddings = manager.embeddings
-
- assert embeddings is mock_embeddings
- mock_init.assert_not_called()
-
-
-class TestLocalEmbeddingManagerIndexedFolders:
- """Tests for LocalEmbeddingManager indexed folders management."""
-
- def test_load_indexed_folders_empty(self, tmp_path):
- """Load indexed folders when no metadata exists."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- assert manager.indexed_folders == {}
-
- def test_load_indexed_folders_from_disk(self, tmp_path):
- """Load indexed folders from disk."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- # Create metadata file
- metadata = {
- "abc123": {
- "path": "/test/folder",
- "last_indexed": 1234567890,
- "file_count": 10,
- }
- }
- metadata_file = tmp_path / "index_metadata.json"
- metadata_file.write_text(json.dumps(metadata))
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- assert "abc123" in manager.indexed_folders
- assert manager.indexed_folders["abc123"]["path"] == "/test/folder"
-
- def test_save_indexed_folders(self, tmp_path):
- """Save indexed folders to disk."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
- manager.indexed_folders = {
- "xyz789": {
- "path": "/another/folder",
- "last_indexed": 9876543210,
- }
- }
-
- manager._save_indexed_folders()
-
- metadata_file = tmp_path / "index_metadata.json"
- assert metadata_file.exists()
-
- saved_data = json.loads(metadata_file.read_text())
- assert "xyz789" in saved_data
-
-
-class TestLocalEmbeddingManagerFolderHash:
- """Tests for LocalEmbeddingManager folder hash methods."""
-
- def test_get_folder_hash(self, tmp_path):
- """Get folder hash is deterministic."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- folder_path = tmp_path / "test_folder"
- folder_path.mkdir()
-
- hash1 = LocalEmbeddingManager.get_folder_hash(folder_path)
- hash2 = LocalEmbeddingManager.get_folder_hash(folder_path)
-
- assert hash1 == hash2
- assert len(hash1) == 32 # MD5 hash length
-
- def test_get_folder_hash_different_folders(self, tmp_path):
- """Different folders have different hashes."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- folder1 = tmp_path / "folder1"
- folder2 = tmp_path / "folder2"
- folder1.mkdir()
- folder2.mkdir()
-
- hash1 = LocalEmbeddingManager.get_folder_hash(folder1)
- hash2 = LocalEmbeddingManager.get_folder_hash(folder2)
-
- assert hash1 != hash2
-
- def test_get_index_path(self, tmp_path):
- """Get index path for a folder."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- folder_path = Path("/test/folder")
- index_path = manager._get_index_path(folder_path)
-
- assert "index_" in str(index_path)
- assert index_path.parent == tmp_path
-
-
-class TestLocalEmbeddingManagerGetAllFiles:
- """Tests for LocalEmbeddingManager _get_all_files method."""
-
- def test_get_all_files(self, tmp_path):
- """Get all files in a folder recursively."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- # Create test folder structure
- (tmp_path / "file1.txt").touch()
- (tmp_path / "file2.txt").touch()
- subdir = tmp_path / "subdir"
- subdir.mkdir()
- (subdir / "file3.txt").touch()
-
- files = list(LocalEmbeddingManager._get_all_files(tmp_path))
-
- assert len(files) == 3
- filenames = [f.name for f in files]
- assert "file1.txt" in filenames
- assert "file2.txt" in filenames
- assert "file3.txt" in filenames
-
-
-class TestLocalEmbeddingManagerCheckConfigChanged:
- """Tests for LocalEmbeddingManager config change detection."""
-
- def test_check_config_changed_new_folder(self, tmp_path):
- """Config changed is True for new folder."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- folder = tmp_path / "new_folder"
- folder.mkdir()
-
- assert manager._check_config_changed(folder) is True
-
- def test_check_config_changed_same_config(self, tmp_path):
- """Config changed is False when config is the same."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- folder = tmp_path / "test_folder"
- folder.mkdir()
-
- folder_hash = manager.get_folder_hash(folder)
- manager.indexed_folders[folder_hash] = {
- "chunk_size": 1000,
- "chunk_overlap": 200,
- "embedding_model": "all-MiniLM-L6-v2",
- }
-
- assert manager._check_config_changed(folder) is False
-
- def test_check_config_changed_different_chunk_size(self, tmp_path):
- """Config changed is True when chunk size differs."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- folder = tmp_path / "test_folder"
- folder.mkdir()
-
- folder_hash = manager.get_folder_hash(folder)
- manager.indexed_folders[folder_hash] = {
- "chunk_size": 500, # Different from default 1000
- "chunk_overlap": 200,
- "embedding_model": "all-MiniLM-L6-v2",
- }
-
- assert manager._check_config_changed(folder) is True
-
-
-class TestLocalEmbeddingManagerClearCache:
- """Tests for LocalEmbeddingManager clear_cache method."""
-
- def test_clear_cache(self, tmp_path):
- """Clear cache removes vector stores from memory."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
- manager.vector_stores = {"hash1": Mock(), "hash2": Mock()}
-
- manager.clear_cache()
-
- assert manager.vector_stores == {}
-
-
-class TestLocalEmbeddingManagerGetIndexedFoldersInfo:
- """Tests for LocalEmbeddingManager get_indexed_folders_info method."""
-
- def test_get_indexed_folders_info_empty(self, tmp_path):
- """Get indexed folders info when no folders indexed."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- info = manager.get_indexed_folders_info()
-
- assert info == []
-
- def test_get_indexed_folders_info_with_folders(self, tmp_path):
- """Get indexed folders info with indexed folders."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(cache_dir=str(tmp_path))
-
- test_folder = tmp_path / "test_folder"
- test_folder.mkdir()
-
- folder_hash = manager.get_folder_hash(test_folder)
- manager.indexed_folders[folder_hash] = {
- "path": str(test_folder),
- "last_indexed": 1234567890,
- "file_count": 5,
- "chunk_count": 20,
- }
-
- info = manager.get_indexed_folders_info()
-
- assert len(info) == 1
- assert info[0]["path"] == str(test_folder)
- assert info[0]["file_count"] == 5
- assert "last_indexed_formatted" in info[0]
-
-
-class TestLocalSearchEngineInit:
- """Tests for LocalSearchEngine initialization."""
-
- def test_init_with_valid_paths(self, tmp_path):
- """Initialize with valid folder paths."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- # Mock embedding manager to avoid actual initialization
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine(
- paths=[str(folder)],
- name="Test Collection",
- description="Test description",
- )
-
- assert str(folder) in engine.valid_folder_paths
- assert engine.name == "Test Collection"
- assert engine.description == "Test description"
-
- def test_init_with_invalid_paths(self, tmp_path):
- """Initialize with invalid folder paths."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- invalid_path = str(tmp_path / "nonexistent")
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ):
- engine = LocalSearchEngine(paths=[invalid_path])
-
- assert invalid_path not in engine.valid_folder_paths
- assert engine.valid_folder_paths == []
-
- def test_init_with_custom_max_results(self, tmp_path):
- """Initialize with custom max_results."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine(paths=[str(folder)], max_results=50)
-
- assert engine.max_results == 50
-
- def test_init_with_collections(self, tmp_path):
- """Initialize with named collections."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder1 = tmp_path / "research"
- folder2 = tmp_path / "notes"
- folder1.mkdir()
- folder2.mkdir()
-
- collections = {
- "research": {
- "paths": [str(folder1)],
- "description": "Research papers",
- },
- "notes": {"paths": [str(folder2)], "description": "Personal notes"},
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine(
- paths=[str(folder1), str(folder2)], collections=collections
- )
-
- assert "research" in engine.collections
- assert "notes" in engine.collections
-
-
-class TestLocalSearchEngineGetPreviews:
- """Tests for LocalSearchEngine _get_previews method."""
-
- def test_get_previews_returns_results(self, tmp_path):
- """Get previews returns formatted results."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- mock_results = [
- {
- "content": "Test content for document one",
- "metadata": {
- "source": str(folder / "doc1.txt"),
- "filename": "doc1.txt",
- },
- "similarity": 0.95,
- "folder": folder,
- }
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.search.return_value = mock_results
-
- engine = LocalSearchEngine(paths=[str(folder)])
- previews = engine._get_previews("test query")
-
- assert len(previews) == 1
- assert previews[0]["title"] == "doc1.txt"
- assert previews[0]["similarity"] == 0.95
-
- def test_get_previews_empty_results(self, tmp_path):
- """Get previews handles empty results."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.search.return_value = []
-
- engine = LocalSearchEngine(paths=[str(folder)])
- previews = engine._get_previews("test query")
-
- assert previews == []
-
- def test_get_previews_no_valid_folders(self, tmp_path):
- """Get previews returns empty for no valid folders."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ):
- engine = LocalSearchEngine(paths=["/nonexistent/path"])
- previews = engine._get_previews("test query")
-
- assert previews == []
-
-
-class TestLocalSearchEngineGetFullContent:
- """Tests for LocalSearchEngine _get_full_content method."""
-
- def test_get_full_content_preserves_content(self, tmp_path):
- """Get full content preserves full content from items."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- items = [
- {
- "id": "local-1",
- "title": "Doc 1",
- "_full_content": "This is the full content of document 1",
- "_metadata": {"source": "/path/to/doc1.txt"},
- }
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.search_config"
- ) as mock_config:
- mock_config.SEARCH_SNIPPETS_ONLY = False
-
- engine = LocalSearchEngine(paths=[str(folder)])
- results = engine._get_full_content(items)
-
- assert len(results) == 1
- assert (
- results[0]["full_content"]
- == "This is the full content of document 1"
- )
- assert "_full_content" not in results[0]
-
- def test_get_full_content_snippets_only(self, tmp_path):
- """Get full content respects snippets-only mode."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- items = [
- {
- "id": "local-1",
- "title": "Doc 1",
- "_full_content": "Full content",
- }
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.search_config"
- ) as mock_config:
- mock_config.SEARCH_SNIPPETS_ONLY = True
-
- engine = LocalSearchEngine(paths=[str(folder)])
- results = engine._get_full_content(items)
-
- # In snippets-only mode, items are returned as-is
- assert results == items
-
-
-class TestLocalSearchEngineRun:
- """Tests for LocalSearchEngine run method."""
-
- def test_run_returns_results(self, tmp_path):
- """Run returns search results."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- mock_results = [
- {
- "content": "Test content",
- "metadata": {
- "source": str(folder / "doc.txt"),
- "filename": "doc.txt",
- },
- "similarity": 0.9,
- "folder": folder,
- }
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.search.return_value = mock_results
- mock_manager.return_value.clear_cache = Mock()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.search_config"
- ) as mock_config:
- mock_config.SEARCH_SNIPPETS_ONLY = False
-
- engine = LocalSearchEngine(paths=[str(folder)])
-
- # Mock _filter_for_relevance to return all items
- with patch.object(
- engine, "_filter_for_relevance", side_effect=lambda x, q: x
- ):
- results = engine.run("test query")
-
- assert len(results) >= 1
-
- def test_run_with_collection_filter(self, tmp_path):
- """Run with collection filter in query."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- collections = {
- "research": {
- "paths": [str(folder)],
- "description": "Research papers",
- },
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.search.return_value = []
- mock_manager.return_value.clear_cache = Mock()
-
- engine = LocalSearchEngine(
- paths=[str(folder)], collections=collections
- )
- results = engine.run("collection:research test query")
-
- # Should parse collection from query
- assert results == []
-
- def test_run_empty_previews(self, tmp_path):
- """Run returns empty when no previews found."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.search.return_value = []
- mock_manager.return_value.clear_cache = Mock()
-
- engine = LocalSearchEngine(paths=[str(folder)])
- results = engine.run("test query")
-
- assert results == []
-
-
-class TestLocalSearchEngineGetCollectionsInfo:
- """Tests for LocalSearchEngine get_collections_info method."""
-
- def test_get_collections_info(self, tmp_path):
- """Get collections info returns collection details."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- collections = {
- "docs": {"paths": [str(folder)], "description": "Documents"},
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
- mock_manager.return_value.indexed_folders = {}
- mock_manager.return_value.get_folder_hash.return_value = "abc123"
-
- engine = LocalSearchEngine(
- paths=[str(folder)], collections=collections
- )
- info = engine.get_collections_info()
-
- assert len(info) == 1
- assert info[0]["name"] == "docs"
- assert info[0]["description"] == "Documents"
-
-
-class TestLocalSearchEngineReindexCollection:
- """Tests for LocalSearchEngine reindex_collection method."""
-
- def test_reindex_collection_success(self, tmp_path):
- """Reindex collection successfully."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- collections = {
- "docs": {"paths": [str(folder)], "description": "Documents"},
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine(
- paths=[str(folder)], collections=collections
- )
- result = engine.reindex_collection("docs")
-
- assert result is True
-
- def test_reindex_collection_not_found(self, tmp_path):
- """Reindex collection that doesn't exist."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine(paths=[str(folder)])
- result = engine.reindex_collection("nonexistent")
-
- assert result is False
-
-
-class TestLocalSearchEngineFromConfig:
- """Tests for LocalSearchEngine from_config class method."""
-
- def test_from_config_with_collections(self, tmp_path):
- """Create from config with collections."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- config = {
- "collections": {
- "docs": {"paths": [str(folder)], "description": "Documents"},
- },
- "max_results": 20,
- "embedding_model": "custom-model",
- "chunk_size": 500,
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine.from_config(config)
-
- assert engine.max_results == 20
- assert "docs" in engine.collections
-
- def test_from_config_with_folder_paths(self, tmp_path):
- """Create from config with folder_paths fallback."""
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- folder = tmp_path / "documents"
- folder.mkdir()
-
- config = {
- "folder_paths": [str(folder)],
- "max_results": 15,
- }
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager"
- ) as mock_manager:
- mock_manager.return_value.index_folder.return_value = True
-
- engine = LocalSearchEngine.from_config(config)
-
- assert engine.max_results == 15
- assert "default" in engine.collections
diff --git a/tests/web_search_engines/engines/test_search_engine_local_all.py b/tests/web_search_engines/engines/test_search_engine_local_all.py
deleted file mode 100644
index 27ce69258..000000000
--- a/tests/web_search_engines/engines/test_search_engine_local_all.py
+++ /dev/null
@@ -1,463 +0,0 @@
-"""
-Tests for the LocalAllSearchEngine class.
-
-Tests cover:
-- Initialization and configuration
-- Local engine discovery
-- Search across all collections
-- Preview aggregation
-- Full content retrieval
-"""
-
-from unittest.mock import Mock, patch
-
-
-class TestLocalAllSearchEngineInit:
- """Tests for LocalAllSearchEngine initialization."""
-
- def test_init_with_defaults(self):
- """Initialize with default values."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine()
-
- assert engine.max_results == 10
- assert engine.local_engines == {}
-
- def test_init_with_custom_max_results(self):
- """Initialize with custom max_results."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine(max_results=25)
-
- assert engine.max_results == 25
-
- def test_init_discovers_local_engines(self):
- """Initialize and discover local collection engines."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "Test Collection"
- mock_engine.description = "Test description"
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["collection1", "collection2"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine()
-
- assert "collection1" in engine.local_engines
- assert "collection2" in engine.local_engines
-
- def test_init_handles_engine_creation_failure(self):
- """Initialize handles engine creation failure gracefully."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["collection1"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- side_effect=Exception("Engine creation failed"),
- ):
- engine = LocalAllSearchEngine()
-
- assert engine.local_engines == {}
-
- def test_init_handles_import_error(self):
- """Initialize handles ImportError for local_search_engines."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- side_effect=ImportError("No config found"),
- ):
- engine = LocalAllSearchEngine()
-
- assert engine.local_engines == {}
-
- def test_init_with_llm(self):
- """Initialize with LLM."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_llm = Mock()
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine(llm=mock_llm)
-
- assert engine.llm is mock_llm
-
- def test_init_with_settings_snapshot(self):
- """Initialize with settings snapshot."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- settings = {"_username": "testuser"}
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine(settings_snapshot=settings)
-
- assert engine.settings_snapshot == settings
-
- def test_init_with_programmatic_mode(self):
- """Initialize with programmatic mode."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine(programmatic_mode=True)
-
- assert engine.programmatic_mode is True
-
-
-class TestGetPreviews:
- """Tests for _get_previews method."""
-
- def test_get_previews_returns_aggregated_results(self):
- """Get previews returns results from all local engines."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine1 = Mock()
- mock_engine1.name = "Collection 1"
- mock_engine1.description = "Description 1"
- mock_engine1._get_previews.return_value = [
- {"id": "1", "snippet": "Result 1", "similarity": 0.9}
- ]
-
- mock_engine2 = Mock()
- mock_engine2.name = "Collection 2"
- mock_engine2.description = "Description 2"
- mock_engine2._get_previews.return_value = [
- {"id": "2", "snippet": "Result 2", "similarity": 0.8}
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1", "col2"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine"
- ) as mock_create:
- mock_create.side_effect = [mock_engine1, mock_engine2]
-
- engine = LocalAllSearchEngine()
- previews = engine._get_previews("test query")
-
- assert len(previews) == 2
- assert previews[0]["collection_id"] == "col1"
- assert previews[1]["collection_id"] == "col2"
-
- def test_get_previews_sorts_by_similarity(self):
- """Get previews sorts results by similarity."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine1 = Mock()
- mock_engine1.name = "Collection 1"
- mock_engine1.description = "Description 1"
- mock_engine1._get_previews.return_value = [
- {"id": "1", "snippet": "Result 1", "similarity": 0.5}
- ]
-
- mock_engine2 = Mock()
- mock_engine2.name = "Collection 2"
- mock_engine2.description = "Description 2"
- mock_engine2._get_previews.return_value = [
- {"id": "2", "snippet": "Result 2", "similarity": 0.9}
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1", "col2"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine"
- ) as mock_create:
- mock_create.side_effect = [mock_engine1, mock_engine2]
-
- engine = LocalAllSearchEngine()
- previews = engine._get_previews("test query")
-
- # Higher similarity should come first
- assert previews[0]["similarity"] == 0.9
- assert previews[1]["similarity"] == 0.5
-
- def test_get_previews_limits_results(self):
- """Get previews limits results to max_results."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "Collection"
- mock_engine.description = "Description"
- mock_engine._get_previews.return_value = [
- {
- "id": str(i),
- "snippet": f"Result {i}",
- "similarity": 0.9 - i * 0.1,
- }
- for i in range(10)
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine(max_results=5)
- previews = engine._get_previews("test query")
-
- assert len(previews) == 5
-
- def test_get_previews_empty_results(self):
- """Get previews handles no local engines."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine()
- previews = engine._get_previews("test query")
-
- assert previews == []
-
- def test_get_previews_handles_engine_error(self):
- """Get previews handles engine search error gracefully."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "Collection"
- mock_engine.description = "Description"
- mock_engine._get_previews.side_effect = Exception("Search error")
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine()
- previews = engine._get_previews("test query")
-
- assert previews == []
-
- def test_get_previews_adds_collection_info(self):
- """Get previews adds collection info to each preview."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "My Collection"
- mock_engine.description = "Collection description"
- mock_engine._get_previews.return_value = [
- {"id": "1", "snippet": "Result"}
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["my_collection"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine()
- previews = engine._get_previews("test query")
-
- assert previews[0]["collection_id"] == "my_collection"
- assert previews[0]["collection_name"] == "My Collection"
- assert (
- previews[0]["collection_description"]
- == "Collection description"
- )
-
-
-class TestGetFullContent:
- """Tests for _get_full_content method."""
-
- def test_get_full_content_delegates_to_engines(self):
- """Get full content delegates to appropriate collection engines."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "Collection"
- mock_engine.description = "Description"
- mock_engine._get_full_content.return_value = [
- {"id": "1", "full_content": "Full content 1"}
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine()
-
- items = [{"id": "1", "collection_id": "col1"}]
- results = engine._get_full_content(items)
-
- assert len(results) == 1
- assert results[0]["full_content"] == "Full content 1"
-
- def test_get_full_content_groups_by_collection(self):
- """Get full content groups items by collection."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine1 = Mock()
- mock_engine1.name = "Collection 1"
- mock_engine1.description = "Description 1"
- mock_engine1._get_full_content.return_value = [
- {"id": "1", "full_content": "Content 1"}
- ]
-
- mock_engine2 = Mock()
- mock_engine2.name = "Collection 2"
- mock_engine2.description = "Description 2"
- mock_engine2._get_full_content.return_value = [
- {"id": "2", "full_content": "Content 2"}
- ]
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1", "col2"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine"
- ) as mock_create:
- mock_create.side_effect = [mock_engine1, mock_engine2]
-
- engine = LocalAllSearchEngine()
-
- items = [
- {"id": "1", "collection_id": "col1"},
- {"id": "2", "collection_id": "col2"},
- ]
- results = engine._get_full_content(items)
-
- assert len(results) == 2
-
- def test_get_full_content_handles_engine_error(self):
- """Get full content handles engine error gracefully."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- mock_engine = Mock()
- mock_engine.name = "Collection"
- mock_engine.description = "Description"
- mock_engine._get_full_content.side_effect = Exception("Content error")
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=["col1"],
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine",
- return_value=mock_engine,
- ):
- engine = LocalAllSearchEngine()
-
- items = [
- {"id": "1", "collection_id": "col1", "snippet": "Preview"}
- ]
- results = engine._get_full_content(items)
-
- # Should return original items on error
- assert len(results) == 1
- assert results[0]["id"] == "1"
-
- def test_get_full_content_handles_unknown_collection(self):
- """Get full content handles items with unknown collection."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine()
-
- items = [{"id": "1", "collection_id": "unknown_collection"}]
- results = engine._get_full_content(items)
-
- # Should return the unprocessed item
- assert len(results) == 1
- assert results[0]["id"] == "1"
-
- def test_get_full_content_handles_missing_collection_id(self):
- """Get full content handles items without collection_id."""
- from local_deep_research.web_search_engines.engines.search_engine_local_all import (
- LocalAllSearchEngine,
- )
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines",
- return_value=[],
- ):
- engine = LocalAllSearchEngine()
-
- items = [{"id": "1", "snippet": "No collection ID"}]
- results = engine._get_full_content(items)
-
- # Should return unprocessed item
- assert len(results) == 1
- assert results[0]["id"] == "1"
diff --git a/tests/web_search_engines/rate_limiting/test_tracker.py b/tests/web_search_engines/rate_limiting/test_tracker.py
index 1bd442dcd..9b238837b 100644
--- a/tests/web_search_engines/rate_limiting/test_tracker.py
+++ b/tests/web_search_engines/rate_limiting/test_tracker.py
@@ -236,37 +236,6 @@ class TestGetWaitTime:
assert wait_time == 0.1 # Optimistic default
- def test_local_engine_returns_zero(self):
- """Test that LocalSearchEngine gets zero wait time."""
- from local_deep_research.config.thread_settings import (
- NoSettingsContextError,
- )
-
- with patch(
- "local_deep_research.web_search_engines.rate_limiting.tracker.get_setting_from_snapshot"
- ) as mock_get_setting:
- mock_get_setting.side_effect = NoSettingsContextError("No settings")
-
- with patch(
- "local_deep_research.web_search_engines.rate_limiting.tracker.logger"
- ):
- with patch(
- "local_deep_research.web_search_engines.rate_limiting.tracker.get_search_context"
- ) as mock_context:
- mock_context.return_value = {"username": "test"}
-
- from local_deep_research.web_search_engines.rate_limiting.tracker import (
- AdaptiveRateLimitTracker,
- )
-
- tracker = AdaptiveRateLimitTracker(programmatic_mode=True)
- tracker.enabled = True
- tracker._estimates_loaded = True
-
- wait_time = tracker.get_wait_time("LocalSearchEngine")
-
- assert wait_time == 0.0
-
def test_known_engine_uses_estimate(self):
"""Test that known engine uses learned estimate."""
from local_deep_research.config.thread_settings import (
diff --git a/tests/web_search_engines/test_local_embedding_manager.py b/tests/web_search_engines/test_local_embedding_manager.py
index e5ff7efee..a76fd0f68 100644
--- a/tests/web_search_engines/test_local_embedding_manager.py
+++ b/tests/web_search_engines/test_local_embedding_manager.py
@@ -1,204 +1,69 @@
"""
-Tests for LocalEmbeddingManager cache directory handling and thread safety.
+Tests for LocalEmbeddingManager thread safety and cache directory utilities.
-These tests verify that the cache directory is properly resolved
-to an absolute path using the application's configured cache directory,
-and that embedding initialization is thread-safe.
+These tests verify that embedding initialization is thread-safe
+and that the cache directory utility returns correct paths.
"""
import os
import tempfile
import threading
-from pathlib import Path
from unittest.mock import MagicMock, patch
-class TestLocalEmbeddingManagerCacheDir:
- """Tests for LocalEmbeddingManager cache directory configuration."""
-
- def test_cache_dir_uses_absolute_path_when_none(self):
- """When cache_dir is None, should use get_cache_directory()."""
- # Create a temporary directory to use as the cache directory
- with tempfile.TemporaryDirectory() as temp_dir:
- temp_path = Path(temp_dir)
-
- # Mock dependencies to avoid loading actual models
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory",
- return_value=temp_path,
- ):
- # Also mock the embeddings to avoid loading real models
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders",
- return_value={},
- ):
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(
- embedding_model="test-model",
- cache_dir=None, # Should use get_cache_directory()
- )
-
- # Should resolve to temp_path / "local_search"
- expected_path = temp_path / "local_search"
- assert manager.cache_dir == expected_path
- assert manager.cache_dir.is_absolute()
-
- def test_cache_dir_uses_explicit_path_when_provided(self):
- """When cache_dir is provided, should use that path."""
- with tempfile.TemporaryDirectory() as temp_dir:
- explicit_path = str(Path(temp_dir) / "my_custom_cache")
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders",
- return_value={},
- ):
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- manager = LocalEmbeddingManager(
- embedding_model="test-model",
- cache_dir=explicit_path,
- )
-
- assert manager.cache_dir == Path(explicit_path)
-
- def test_cache_dir_not_relative(self):
- """Cache dir should never be a relative path like '.cache'."""
- with tempfile.TemporaryDirectory() as temp_dir:
- temp_path = Path(temp_dir)
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory",
- return_value=temp_path,
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders",
- return_value={},
- ):
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
-
- # Default behavior (cache_dir=None) should NOT result in .cache
- manager = LocalEmbeddingManager(
- embedding_model="test-model",
- )
-
- # The path should not start with ".cache"
- assert not str(manager.cache_dir).startswith(".cache")
- # The path should be absolute
- assert manager.cache_dir.is_absolute()
-
-
-class TestLocalSearchEngineCacheDir:
- """Tests for LocalSearchEngine cache directory configuration."""
-
- def test_from_config_uses_none_for_missing_cache_dir(self):
- """from_config should pass None when cache_dir not in config."""
- with tempfile.TemporaryDirectory() as temp_dir:
- temp_path = Path(temp_dir)
-
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory",
- return_value=temp_path,
- ):
- with patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders",
- return_value={},
- ):
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalSearchEngine,
- )
-
- # Create engine from config without cache_dir specified
- config = {
- "folder_paths": [
- temp_dir
- ], # Use temp_dir as a valid folder
- }
-
- engine = LocalSearchEngine.from_config(config)
-
- # The embedding manager's cache_dir should be absolute
- assert engine.embedding_manager.cache_dir.is_absolute()
- # Should not be the old relative path
- assert not str(
- engine.embedding_manager.cache_dir
- ).startswith(".cache")
-
-
class TestEmbeddingThreadSafety:
"""Tests that embedding initialization is thread-safe."""
def test_concurrent_embedding_access_initializes_once(self):
"""Multiple threads accessing .embeddings should only init once."""
- with tempfile.TemporaryDirectory() as temp_dir:
- temp_path = Path(temp_dir)
+ from local_deep_research.web_search_engines.engines.local_embedding_manager import (
+ LocalEmbeddingManager,
+ )
- with (
- patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory",
- return_value=temp_path,
- ),
- patch(
- "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders",
- return_value={},
- ),
- ):
- from local_deep_research.web_search_engines.engines.search_engine_local import (
- LocalEmbeddingManager,
- )
+ manager = LocalEmbeddingManager(
+ embedding_model="test-model",
+ )
- manager = LocalEmbeddingManager(
- embedding_model="test-model",
- )
+ # Track how many times _initialize_embeddings is called
+ init_count = 0
+ init_lock = threading.Lock()
+ mock_embeddings = MagicMock()
- # Track how many times _initialize_embeddings is called
- init_count = 0
- init_lock = threading.Lock()
- mock_embeddings = MagicMock()
+ def counting_init():
+ nonlocal init_count
+ with init_lock:
+ init_count += 1
+ # Simulate slow initialization to widen race window
+ import time
- def counting_init():
- nonlocal init_count
- with init_lock:
- init_count += 1
- # Simulate slow initialization to widen race window
- import time
+ time.sleep(0.1)
+ return mock_embeddings
- time.sleep(0.1)
- return mock_embeddings
+ manager._initialize_embeddings = counting_init
- manager._initialize_embeddings = counting_init
+ # Access embeddings from multiple threads concurrently
+ results = []
+ errors = []
- # Access embeddings from multiple threads concurrently
- results = []
- errors = []
+ def access_embeddings():
+ try:
+ emb = manager.embeddings
+ results.append(emb)
+ except Exception as e:
+ errors.append(e)
- def access_embeddings():
- try:
- emb = manager.embeddings
- results.append(emb)
- except Exception as e:
- errors.append(e)
+ threads = [threading.Thread(target=access_embeddings) for _ in range(4)]
+ for t in threads:
+ t.start()
+ for t in threads:
+ t.join()
- threads = [
- threading.Thread(target=access_embeddings) for _ in range(4)
- ]
- for t in threads:
- t.start()
- for t in threads:
- t.join()
-
- assert not errors, f"Threads raised errors: {errors}"
- assert init_count == 1, (
- f"_initialize_embeddings called {init_count} times, expected 1"
- )
- # All threads should get the same instance
- assert all(r is mock_embeddings for r in results)
+ assert not errors, f"Threads raised errors: {errors}"
+ assert init_count == 1, (
+ f"_initialize_embeddings called {init_count} times, expected 1"
+ )
+ # All threads should get the same instance
+ assert all(r is mock_embeddings for r in results)
class TestGetCacheDirectory:
diff --git a/tests/web_search_engines/test_search_engines_config.py b/tests/web_search_engines/test_search_engines_config.py
index a6bdb640d..9f03bb3a5 100644
--- a/tests/web_search_engines/test_search_engines_config.py
+++ b/tests/web_search_engines/test_search_engines_config.py
@@ -326,123 +326,6 @@ class TestSearchConfig:
result["custom_retriever"]["class_name"] == "RetrieverSearchEngine"
)
- @patch(
- "local_deep_research.web_search_engines.retriever_registry.retriever_registry"
- )
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_processes_local_collections(self, mock_get_setting, mock_registry):
- """Should process local collection configurations."""
- mock_registry.list_registered.return_value = []
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "my_docs.enabled": True,
- "my_docs.paths": '["./docs"]',
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- search_config,
- )
-
- result = search_config()
- assert "my_docs" in result
- assert result["my_docs"]["requires_llm"] is True
-
- @patch(
- "local_deep_research.web_search_engines.retriever_registry.retriever_registry"
- )
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_skips_disabled_local_collections(
- self, mock_get_setting, mock_registry
- ):
- """Should skip disabled local collections."""
- mock_registry.list_registered.return_value = []
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "disabled_docs.enabled": False,
- "disabled_docs.paths": '["./docs"]',
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- search_config,
- )
-
- result = search_config()
- assert "disabled_docs" not in result
-
- @patch(
- "local_deep_research.web_search_engines.retriever_registry.retriever_registry"
- )
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_parses_json_paths_for_local_collection(
- self, mock_get_setting, mock_registry
- ):
- """Should parse JSON array for local collection paths."""
- mock_registry.list_registered.return_value = []
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "my_docs.enabled": True,
- "my_docs.paths": '["./path1", "./path2"]',
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- search_config,
- )
-
- result = search_config()
- assert result["my_docs"]["default_params"]["paths"] == [
- "./path1",
- "./path2",
- ]
-
- @patch(
- "local_deep_research.web_search_engines.retriever_registry.retriever_registry"
- )
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_handles_invalid_json_paths(self, mock_get_setting, mock_registry):
- """Should handle invalid JSON in paths gracefully."""
- mock_registry.list_registered.return_value = []
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "my_docs.enabled": True,
- "my_docs.paths": "invalid json",
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- search_config,
- )
-
- result = search_config()
- # Should set to empty list on JSON error
- assert result["my_docs"]["default_params"]["paths"] == []
-
@patch(
"local_deep_research.web_search_engines.retriever_registry.retriever_registry"
)
@@ -717,140 +600,3 @@ class TestDefaultSearchEngine:
default_search_engine(settings_snapshot=snapshot)
call_kwargs = mock_get_setting.call_args[1]
assert call_kwargs["settings_snapshot"] is snapshot
-
-
-class TestLocalSearchEngines:
- """Tests for local_search_engines function."""
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_returns_list_of_enabled_collections(self, mock_get_setting):
- """Should return list of enabled local collection names."""
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "docs1.enabled": True,
- "docs2.enabled": True,
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- result = local_search_engines()
- assert isinstance(result, list)
- assert "docs1" in result
- assert "docs2" in result
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_excludes_disabled_collections(self, mock_get_setting):
- """Should exclude disabled collections."""
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "enabled_docs.enabled": True,
- "disabled_docs.enabled": False,
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- result = local_search_engines()
- assert "enabled_docs" in result
- assert "disabled_docs" not in result
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_excludes_local_all_collection(self, mock_get_setting):
- """Should exclude the 'local_all' collection."""
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "local_all.enabled": True,
- "my_docs.enabled": True,
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- result = local_search_engines()
- assert "local_all" not in result
- assert "my_docs" in result
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_returns_empty_list_when_no_local_engines(self, mock_get_setting):
- """Should return empty list when no local engines configured."""
- mock_get_setting.return_value = {}
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- result = local_search_engines()
- assert result == []
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_treats_missing_enabled_as_true(self, mock_get_setting):
- """Should treat missing 'enabled' field as True (default enabled)."""
-
- def get_setting_side_effect(key, default, **kwargs):
- if key == "search.engine.local":
- return {
- "implicit_enabled.paths": '["./docs"]',
- }
- return default
-
- mock_get_setting.side_effect = get_setting_side_effect
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- result = local_search_engines()
- assert "implicit_enabled" in result
-
- @patch(
- "local_deep_research.web_search_engines.search_engines_config._get_setting"
- )
- def test_passes_all_parameters_to_get_setting(self, mock_get_setting):
- """Should pass username, db_session, and settings_snapshot."""
- mock_get_setting.return_value = {}
- mock_session = MagicMock()
- snapshot = {"test": "value"}
-
- from local_deep_research.web_search_engines.search_engines_config import (
- local_search_engines,
- )
-
- local_search_engines(
- username="testuser",
- db_session=mock_session,
- settings_snapshot=snapshot,
- )
-
- call_kwargs = mock_get_setting.call_args[1]
- assert call_kwargs["username"] == "testuser"
- assert call_kwargs["db_session"] is mock_session
- assert call_kwargs["settings_snapshot"] is snapshot
diff --git a/unraid-templates/local-deep-research.xml b/unraid-templates/local-deep-research.xml
index 0b87cb489..3acf267b3 100644
--- a/unraid-templates/local-deep-research.xml
+++ b/unraid-templates/local-deep-research.xml
@@ -35,9 +35,6 @@ This template includes the main LDR service. For full functionality, you may als
5000
/mnt/user/appdata/local-deep-research/data
/mnt/user/appdata/local-deep-research/scripts
-
-
-
0.0.0.0
5000
/data