From 33119ae2a4e928bb54dcd25380fde16117c52e82 Mon Sep 17 00:00:00 2001 From: LearningCircuit <185559241+LearningCircuit@users.noreply.github.com> Date: Sat, 28 Feb 2026 16:00:13 +0100 Subject: [PATCH] refactor: remove deprecated settings-based local search engines (#2344) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: remove deprecated settings-based local search engines The old settings-based local engines (research_papers, project_docs, personal_notes, local_all) are fully superseded by the database-backed Collection system with CollectionSearchEngine and LibraryRAGSearchEngine. - Delete LocalAllSearchEngine and LocalSearchEngine classes - Remove 58 settings entries from default_settings.json - Remove local engine registration from search_engines_config.py - Remove local_search_engines() function - Clean up LocalEmbeddingManager: remove 14 dead methods and unused attrs - Remove Docker volume mounts for local_collections - Update security whitelist, rate limiter, bearer config - Remove dead force_reindex code path in research_functions.py - Update docs to reference Collections UI - Remove/update all associated tests - Regenerate golden master settings * fix: address review comments from djpetti - Revert unintentional formatting change in theme options (keep compact inline format) - Restore unicode arrow character (→) that was escaped to \u2192 by JSON serializer - Rename search_engine_local.py → local_embedding_manager.py since it only contains LocalEmbeddingManager now (no search engines) - Remove unused chunk_size, chunk_overlap, cache_dir params from LocalEmbeddingManager - Update all imports and references across codebase --- .github/scripts/check-file-writes.sh | 2 +- .secrets.baseline | 4 +- bearer.yml | 1 - docker-compose.unraid.yml | 5 - docker-compose.yml | 14 - docs/deployment/unraid.md | 25 +- docs/docker-compose-guide.md | 12 +- docs/faq.md | 29 +- .../api/research_functions.py | 7 - src/local_deep_research/defaults/__init__.py | 1 - .../defaults/default_settings.json | 900 ---------- .../research_library/routes/rag_routes.py | 4 +- .../services/library_rag_service.py | 4 +- .../security/module_whitelist.py | 5 +- src/local_deep_research/web/api.py | 2 +- .../engines/local_embedding_manager.py | 290 ++++ .../engines/search_engine_local.py | 1467 ----------------- .../engines/search_engine_local_all.py | 170 -- .../rate_limiting/tracker.py | 1 - .../search_engines_config.py | 105 +- tests/rate_limiting/test_rate_limiting.py | 10 - .../routes/test_rag_routes.py | 2 +- .../test_absolute_module_paths_hook.py | 2 +- tests/settings/golden_master_settings.json | 900 ---------- .../engines/test_local_embedding_manager.py | 96 ++ .../engines/test_search_engine_local.py | 1073 ------------ .../engines/test_search_engine_local_all.py | 463 ------ .../rate_limiting/test_tracker.py | 31 - .../test_local_embedding_manager.py | 219 +-- .../test_search_engines_config.py | 254 --- unraid-templates/local-deep-research.xml | 3 - 31 files changed, 454 insertions(+), 5647 deletions(-) create mode 100644 src/local_deep_research/web_search_engines/engines/local_embedding_manager.py delete mode 100644 src/local_deep_research/web_search_engines/engines/search_engine_local.py delete mode 100644 src/local_deep_research/web_search_engines/engines/search_engine_local_all.py create mode 100644 tests/web_search_engines/engines/test_local_embedding_manager.py delete mode 100644 tests/web_search_engines/engines/test_search_engine_local.py delete mode 100644 tests/web_search_engines/engines/test_search_engine_local_all.py diff --git a/.github/scripts/check-file-writes.sh b/.github/scripts/check-file-writes.sh index d140fabfd..74b0820ad 100755 --- a/.github/scripts/check-file-writes.sh +++ b/.github/scripts/check-file-writes.sh @@ -181,7 +181,7 @@ if [ -n "$ALL_MATCHES" ]; then # Filter system config files (not user data) if [ "$skip_line" -eq 0 ]; then - if echo "$line" | grep -qE "web/app_factory\.py|web/server_config\.py|web_search_engines/engines/search_engine_local\.py|document_loaders/bytes_loader\.py"; then + if echo "$line" | grep -qE "web/app_factory\.py|web/server_config\.py|document_loaders/bytes_loader\.py"; then skip_line=1 fi fi diff --git a/.secrets.baseline b/.secrets.baseline index da1b4b1a7..0df2e81b6 100644 --- a/.secrets.baseline +++ b/.secrets.baseline @@ -677,7 +677,7 @@ "filename": "src/local_deep_research/defaults/default_settings.json", "hashed_secret": "7b976de60179f0603eec51250b33aacd56d90ef6", "is_verified": false, - "line_number": 5946 + "line_number": 5046 } ], "src/local_deep_research/llm/providers/implementations/anthropic.py": [ @@ -5188,5 +5188,5 @@ } ] }, - "generated_at": "2026-02-28T11:10:23Z" + "generated_at": "2026-02-28T12:03:11Z" } diff --git a/bearer.yml b/bearer.yml index f7c85d74b..7bb2ca043 100644 --- a/bearer.yml +++ b/bearer.yml @@ -26,7 +26,6 @@ rule: # for these non-cryptographic uses where collision resistance is not critical. # # Usage locations: - # - search_engine_local.py: Cache key generation for search results # - research_service.py: Content deduplication hashes # - search_cache.py: Cache key generation # - benchmark_service.py: Test result identification diff --git a/docker-compose.unraid.yml b/docker-compose.unraid.yml index 3115a0fff..b5d91470a 100644 --- a/docker-compose.unraid.yml +++ b/docker-compose.unraid.yml @@ -22,11 +22,6 @@ services: - /mnt/user/appdata/local-deep-research/data:/data - /mnt/user/appdata/local-deep-research/scripts:/scripts - # Optional: Uncomment to add your document directories - # - /mnt/user/documents/personal:/local_collections/personal_notes:ro - # - /mnt/user/documents/projects:/local_collections/project_docs:ro - # - /mnt/user/documents/papers:/local_collections/research_papers:ro - ollama: volumes: # Override named volume with Unraid path diff --git a/docker-compose.yml b/docker-compose.yml index 115a6ed9c..849de6621 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -122,20 +122,6 @@ services: volumes: - ldr_data:/data - ldr_scripts:/scripts - # ============================================================================ - # LOCAL DOCUMENT COLLECTIONS (Optional) - # Mount your document directories here to search them with LDR. - # - # For Unraid users, replace paths with your Unraid shares: - # - /mnt/user/documents/personal:/local_collections/personal_notes/:ro - # - /mnt/user/documents/projects:/local_collections/project_docs/:ro - # - /mnt/user/papers:/local_collections/research_papers/:ro - # - # The :ro (read-only) suffix is recommended for safety. - # ============================================================================ - - ./local_collections/personal_notes:/local_collections/personal_notes/:ro - - ./local_collections/project_docs:/local_collections/project_docs/:ro - - ./local_collections/research_papers:/local_collections/research_papers/:ro # ============================================================================ # CONTAINER SECURITY — Principle of Least Privilege # ============================================================================ diff --git a/docs/deployment/unraid.md b/docs/deployment/unraid.md index 8bdf8dab2..e071ee0d7 100644 --- a/docs/deployment/unraid.md +++ b/docs/deployment/unraid.md @@ -117,7 +117,6 @@ All volumes should be under `/mnt/user/appdata/local-deep-research/` for best pr | `/scripts` | `/mnt/user/appdata/local-deep-research/scripts` | Startup scripts (for Ollama integration) | Yes | | `/root/.ollama` (ollama) | `/mnt/user/appdata/local-deep-research/ollama` | Downloaded LLM models (5-15GB each) | If using Ollama | | `/etc/searxng` (searxng) | `/mnt/user/appdata/local-deep-research/searxng` | SearXNG configuration | If using SearXNG | -| `/local_collections/*` | `/mnt/user/documents/*` | Your document directories to search | Optional | **Performance Tip:** If your appdata share is set to "cache-only", you can use `/mnt/cache/appdata/local-deep-research/` instead of `/mnt/user/appdata/local-deep-research/` for better performance (bypasses FUSE overhead). @@ -183,26 +182,12 @@ If running LDR alone with external services: ## 🎮 Using Local Documents -To search your Unraid shares (documents, notes, etc.): +To search your local documents, use the **Collections** system in the Web UI: -**For Template Installation (Method 1):** -1. Edit the container -2. Add **Path** mappings under volume configuration: - - **Container Path:** `/local_collections/personal_notes` → **Host Path:** `/mnt/user/documents/personal` (Read-only) - - **Container Path:** `/local_collections/project_docs` → **Host Path:** `/mnt/user/documents/projects` (Read-only) - - **Container Path:** `/local_collections/research_papers` → **Host Path:** `/mnt/user/papers` (Read-only) -3. Apply changes and restart - -**For Docker Compose Installation (Method 2):** -1. Edit `docker-compose.unraid.yml` -2. Uncomment the document collection lines and adjust paths: - ```yaml - - /mnt/user/documents/personal:/local_collections/personal_notes:ro - - /mnt/user/documents/projects:/local_collections/project_docs:ro - ``` -3. Run **Compose Down** then **Compose Up** to apply changes - -These paths will then be available in LDR's WebUI Settings for searching. +1. Open the LDR Web UI and navigate to the **Collections** page +2. Create a new collection (e.g., "Research Papers", "Project Docs") +3. Upload documents directly through the browser — no volume mounts needed +4. Select your collection as a search engine, or use **"Search All Collections"** to search across everything ## 🎯 GPU Acceleration (NVIDIA) diff --git a/docs/docker-compose-guide.md b/docs/docker-compose-guide.md index 67ca524f7..cba909c81 100644 --- a/docs/docker-compose-guide.md +++ b/docs/docker-compose-guide.md @@ -84,16 +84,8 @@ ports: ### Local Document Collections -Mount directories to search your own documents: - -```yaml -volumes: - - ./local_collections/personal_notes:/local_collections/personal_notes/ - - ./local_collections/project_docs:/local_collections/project_docs/ - - /path/to/your/papers:/local_collections/research_papers/:ro -``` - -The `:ro` suffix makes mounts read-only for safety. +Use the **Collections** system in the Web UI to manage your local documents. +Upload files directly through the Collections page — no volume mounts required. ## Advanced: Cookie Cutter Configuration diff --git a/docs/faq.md b/docs/faq.md index f3cff4bad..71416fc58 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -325,32 +325,21 @@ See also: [Environment Variables Documentation](env_configuration.md#openrouter) ## Local Document Search -### How do I configure local document paths? +### How do I search my local documents? -1. **In Web UI**: - - Settings → Search for "local" - - Edit "Document Collection Paths" - - Use absolute paths: `["/home/user/documents", "/data/pdfs"]` +Use the **Collections** system in the Web UI: -2. **For Docker**: Mount volumes - ```bash - docker run -v /host/path:/container/path ... - ``` - Then use container path in settings: `["/container/path"]` +1. **Navigate** to the Collections page in the sidebar +2. **Create a collection** (e.g., "Research Papers", "Project Docs") +3. **Upload documents** directly through the UI — supported formats include PDF, TXT, MD, DOCX, and many more +4. **Search** your collections by selecting them as a search engine, or use **"Search All Collections"** (Library RAG) to search across everything ### Local search not finding documents Common issues: -1. **First search is slow** - Initial indexing takes time -2. **Path format** - Use absolute paths, not relative -3. **File types** - Ensure supported formats (PDF, TXT, MD, DOCX) -4. **Permissions** - Check read permissions - -### The @format syntax in settings - -This is a UI hint to expand environment variables. Replace with actual paths: -- Change: `"@format ${DOCS_DIR}/personal_notes"` -- To: `"/home/user/documents/personal_notes"` +1. **First search is slow** — initial indexing takes time +2. **File types** — ensure supported formats (PDF, TXT, MD, DOCX) +3. **Collection not indexed** — re-upload or re-index via the Collections UI ## Performance & Optimization diff --git a/src/local_deep_research/api/research_functions.py b/src/local_deep_research/api/research_functions.py index ff5b9aa47..ecdf6afa1 100644 --- a/src/local_deep_research/api/research_functions.py +++ b/src/local_deep_research/api/research_functions.py @@ -577,13 +577,6 @@ def analyze_documents( # Set max results search.max_results = max_results - # Force reindex if requested - if force_reindex and hasattr(search, "embedding_manager"): - for folder_path in search.folder_paths: - search.embedding_manager.index_folder( - folder_path, force_reindex=True - ) - # Perform the search results = search.run(query) diff --git a/src/local_deep_research/defaults/__init__.py b/src/local_deep_research/defaults/__init__.py index d25ece016..71c0a5fce 100644 --- a/src/local_deep_research/defaults/__init__.py +++ b/src/local_deep_research/defaults/__init__.py @@ -15,7 +15,6 @@ DEFAULTS_DIR = Path(__file__).parent # Default files available in this package DEFAULT_FILES = { "main.toml": DEFAULTS_DIR / "main.toml", - "local_collections.toml": DEFAULTS_DIR / "local_collections.toml", "search_engines.toml": DEFAULTS_DIR / "search_engines.toml", } diff --git a/src/local_deep_research/defaults/default_settings.json b/src/local_deep_research/defaults/default_settings.json index 268b7ca32..d8858cfad 100644 --- a/src/local_deep_research/defaults/default_settings.json +++ b/src/local_deep_research/defaults/default_settings.json @@ -2393,34 +2393,6 @@ ], "visible": true }, - "search.engine.local.local_all.display_name": { - "category": "local_all", - "description": "Display name to use in the U.I. for this search engine.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Display Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Local Documents", - "visible": false - }, - "search.engine.local.local_all.description": { - "category": "local_all", - "description": "Human-readable description of the search engine.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Search only local documents using RAG.", - "visible": false - }, "search.engine.DEFAULT_SEARCH_ENGINE": { "category": "local_all", "description": "Fallback search engine used when the configured engine is unavailable or has errors.", @@ -2435,585 +2407,6 @@ "value": "wikipedia", "visible": true }, - "search.engine.local.local_all.class_name": { - "category": "local_all", - "description": "Internal: Python class implementing local document search. Do not modify.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Class Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "LocalAllSearchEngine", - "visible": false - }, - "search.engine.local.local_all.module_path": { - "category": "local_all", - "description": "Internal: Python module path for local search implementation. Do not modify.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Module Path", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": ".engines.search_engine_local_all", - "visible": false - }, - "search.engine.local.local_all.reliability": { - "category": "local_all", - "description": "Reliability score (0-1) for local search. Quality depends on your document collection and indexing.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.85, - "visible": true - }, - "search.engine.local.local_all.requires_api_key": { - "category": "local_all", - "description": "Local document search does not require any external API keys.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Requires Api Key", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.local_all.requires_llm": { - "category": "local_all", - "description": "Indicates this engine uses the LLM to rerank and filter results for relevance.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Requires Llm", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.local_all.strengths": { - "category": "local_all", - "description": "Advantages: Searches all local document collections at once, works offline, uses your private documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "searches all local collections", - "personal documents", - "offline access" - ], - "visible": true - }, - "search.engine.local.local_all.weaknesses": { - "category": "local_all", - "description": "Limitations: May return too many results from mixed collections, requires documents to be indexed first.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "may return too many results", - "requires indexing" - ], - "visible": true - }, - "search.engine.local.personal_notes.cache_dir": { - "category": "personal_notes", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.personal_notes.chunk_overlap": { - "category": "personal_notes", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 100, - "visible": true - }, - "search.engine.local.personal_notes.chunk_size": { - "category": "personal_notes", - "description": "Maximum characters per chunk when splitting documents for RAG indexing. Smaller = more precise, larger = more context.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 500, - "visible": true - }, - "search.engine.local.personal_notes.description": { - "category": "personal_notes", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Personal notes and documents", - "visible": true - }, - "search.engine.local.personal_notes.embedding_device": { - "category": "personal_notes", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.personal_notes.embedding_model": { - "category": "personal_notes", - "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.personal_notes.embedding_model_type": { - "category": "personal_notes", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.personal_notes.enabled": { - "category": "personal_notes", - "description": "Enable this document collection for searching. Disable if you don't want to index these documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.personal_notes.max_filtered_results": { - "category": "personal_notes", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 10, - "visible": true - }, - "search.engine.local.personal_notes.max_results": { - "category": "personal_notes", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 30, - "visible": true - }, - "search.engine.local.personal_notes.name": { - "category": "personal_notes", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Personal Notes", - "visible": true - }, - "search.engine.local.personal_notes.paths": { - "category": "personal_notes", - "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/personal_notes" - ], - "visible": true - }, - "search.engine.local.personal_notes.reliability": { - "category": "personal_notes", - "description": "Reliability score (0-1). Personal notes are rated lower (0.75) as they may contain informal or subjective content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.75, - "visible": true - }, - "search.engine.local.personal_notes.strengths": { - "category": "personal_notes", - "description": "Advantages: Access to your personal knowledge, notes, and private documents not available elsewhere.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "personal knowledge", - "notes", - "private documents" - ], - "visible": true - }, - "search.engine.local.personal_notes.weaknesses": { - "category": "personal_notes", - "description": "Limitations: Content may be subjective, informal, or incomplete compared to published sources.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "subjective content", - "informal information" - ], - "visible": true - }, - "search.engine.local.project_docs.cache_dir": { - "category": "project_docs", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.project_docs.chunk_overlap": { - "category": "project_docs", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 200, - "visible": true - }, - "search.engine.local.project_docs.chunk_size": { - "category": "project_docs", - "description": "Maximum characters per chunk when splitting documents for RAG indexing. Larger default (1000) suits technical documentation.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 1000, - "visible": true - }, - "search.engine.local.project_docs.description": { - "category": "project_docs", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Project documentation and specifications", - "visible": true - }, - "search.engine.local.project_docs.embedding_device": { - "category": "project_docs", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.project_docs.embedding_model": { - "category": "project_docs", - "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.project_docs.embedding_model_type": { - "category": "project_docs", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.project_docs.enabled": { - "category": "project_docs", - "description": "Enable this document collection for searching. Disable if you don't want to index these documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.project_docs.max_filtered_results": { - "category": "project_docs", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 5, - "visible": true - }, - "search.engine.local.project_docs.max_results": { - "category": "project_docs", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 20, - "visible": true - }, - "search.engine.local.project_docs.name": { - "category": "project_docs", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Project Documents", - "visible": true - }, - "search.engine.local.project_docs.paths": { - "category": "project_docs", - "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/project_docs/" - ], - "visible": true - }, - "search.engine.local.project_docs.reliability": { - "category": "project_docs", - "description": "Reliability score (0-1). Project docs rated moderately (0.8) as they are semi-formal technical content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.9, - "visible": true - }, - "search.engine.local.project_docs.strengths": { - "category": "project_docs", - "description": "Advantages: Access to project-specific technical docs, READMEs, and internal documentation not available online.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "project documentation", - "specifications", - "internal documents" - ], - "visible": true - }, - "search.engine.local.project_docs.weaknesses": { - "category": "project_docs", - "description": "Limitations: May be outdated if docs not maintained, limited scope to specific projects.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "no external information", - "limited to organizational knowledge" - ], - "visible": true - }, "search.engine.web.pubmed.display_name": { "category": "pubmed", "description": "Display name to use in the U.I. for this search engine.", @@ -3277,243 +2670,6 @@ ], "visible": true }, - "search.engine.local.research_papers.cache_dir": { - "category": "research_papers", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.research_papers.chunk_overlap": { - "category": "research_papers", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 150, - "visible": true - }, - "search.engine.local.research_papers.chunk_size": { - "category": "research_papers", - "description": "Maximum characters per chunk when splitting papers for RAG indexing. Default (800) balances context and precision.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 800, - "visible": true - }, - "search.engine.local.research_papers.description": { - "category": "research_papers", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Academic research papers and articles", - "visible": true - }, - "search.engine.local.research_papers.embedding_device": { - "category": "research_papers", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.research_papers.embedding_model": { - "category": "research_papers", - "description": "Model for generating text embeddings. Consider 'allenai/specter' for academic papers if available.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.research_papers.embedding_model_type": { - "category": "research_papers", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.research_papers.enabled": { - "category": "research_papers", - "description": "Enable this document collection for searching. Disable if you don't have local research papers.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.research_papers.max_filtered_results": { - "category": "research_papers", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 5, - "visible": true - }, - "search.engine.local.research_papers.max_results": { - "category": "research_papers", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 20, - "visible": true - }, - "search.engine.local.research_papers.name": { - "category": "research_papers", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Research Papers", - "visible": true - }, - "search.engine.local.research_papers.paths": { - "category": "research_papers", - "description": "File paths containing academic papers. Supports PDFs and text formats; directories are indexed recursively.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/research_papers/" - ], - "visible": true - }, - "search.engine.local.research_papers.reliability": { - "category": "research_papers", - "description": "Reliability score (0-1). Research papers rated high (0.95) as they are peer-reviewed academic content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.85, - "visible": true - }, - "search.engine.local.research_papers.strengths": { - "category": "research_papers", - "description": "Advantages: Access to peer-reviewed academic content, scientific papers, and scholarly research in your collection.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "academic research", - "scientific papers", - "scholarly content" - ], - "visible": true - }, - "search.engine.local.research_papers.weaknesses": { - "category": "research_papers", - "description": "Limitations: Limited to papers in your collection, may be outdated if not regularly updated.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "potentially outdated", - "limited to collected papers" - ], - "visible": true - }, "search.engine.web.searxng.display_name": { "category": "searxng", "description": "Display name to use in the U.I. for this search engine.", @@ -5132,62 +4288,6 @@ "value": false, "visible": true }, - "search.engine.local.local_all.use_in_auto_search": { - "category": "local_all", - "description": "Include local documents in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.personal_notes.use_in_auto_search": { - "category": "personal_notes", - "description": "Include personal notes in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.project_docs.use_in_auto_search": { - "category": "project_docs", - "description": "Include project documents in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.research_papers.use_in_auto_search": { - "category": "research_papers", - "description": "Include research papers in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, "app.warnings.dismiss_high_context": { "category": "warnings", "description": "Dismiss warnings about high context window sizes that may cause memory issues", diff --git a/src/local_deep_research/research_library/routes/rag_routes.py b/src/local_deep_research/research_library/routes/rag_routes.py index 3235541f3..d237a9632 100644 --- a/src/local_deep_research/research_library/routes/rag_routes.py +++ b/src/local_deep_research/research_library/routes/rag_routes.py @@ -2154,7 +2154,7 @@ def _get_rag_service_for_thread( Create RAG service for use in background threads (no Flask context). """ from ...database.session_context import get_user_db_session - from ...web_search_engines.engines.search_engine_local import ( + from ...web_search_engines.engines.local_embedding_manager import ( LocalEmbeddingManager, ) import json @@ -2263,8 +2263,6 @@ def _get_rag_service_for_thread( embedding_manager = LocalEmbeddingManager( embedding_model=embedding_model, embedding_model_type=embedding_provider, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, settings_snapshot=settings_snapshot, ) embedding_manager.db_password = db_password diff --git a/src/local_deep_research/research_library/services/library_rag_service.py b/src/local_deep_research/research_library/services/library_rag_service.py index ca490c3c7..c8f8ab034 100644 --- a/src/local_deep_research/research_library/services/library_rag_service.py +++ b/src/local_deep_research/research_library/services/library_rag_service.py @@ -29,7 +29,7 @@ from ...database.models.library import ( from ...database.session_context import get_user_db_session from ...utilities.type_utils import to_bool from ...embeddings.splitters import get_text_splitter -from ...web_search_engines.engines.search_engine_local import ( +from ...web_search_engines.engines.local_embedding_manager import ( LocalEmbeddingManager, ) from ...security.file_integrity import FileIntegrityManager, FAISSIndexVerifier @@ -126,8 +126,6 @@ class LibraryRAGService: self.embedding_manager = LocalEmbeddingManager( embedding_model=embedding_model, embedding_model_type=embedding_provider, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, settings_snapshot=settings_snapshot, ) diff --git a/src/local_deep_research/security/module_whitelist.py b/src/local_deep_research/security/module_whitelist.py index 7bc14cd04..b15d754e4 100644 --- a/src/local_deep_research/security/module_whitelist.py +++ b/src/local_deep_research/security/module_whitelist.py @@ -40,8 +40,7 @@ ALLOWED_MODULE_PATHS: frozenset[str] = frozenset( ".engines.search_engine_google_pse", ".engines.search_engine_guardian", ".engines.search_engine_library", - ".engines.search_engine_local", - ".engines.search_engine_local_all", + ".engines.local_embedding_manager", ".engines.search_engine_mojeek", ".engines.search_engine_nasa_ads", ".engines.search_engine_openalex", @@ -81,8 +80,6 @@ ALLOWED_CLASS_NAMES: frozenset[str] = frozenset( "GooglePSESearchEngine", "GuardianSearchEngine", "LibraryRAGSearchEngine", - "LocalAllSearchEngine", - "LocalSearchEngine", "MetaSearchEngine", "MojeekSearchEngine", "NasaAdsSearchEngine", diff --git a/src/local_deep_research/web/api.py b/src/local_deep_research/web/api.py index e4adbef4d..e9600d8f2 100644 --- a/src/local_deep_research/web/api.py +++ b/src/local_deep_research/web/api.py @@ -423,7 +423,7 @@ def api_analyze_documents(): POST /api/v1/analyze_documents { "query": "neural networks in medicine", - "collection_name": "research_papers", # Required: local collection name + "collection_name": "my_collection", # Required: local collection name "max_results": 20, # Optional: max results to return "temperature": 0.7, # Optional: LLM temperature "force_reindex": false # Optional: force reindexing diff --git a/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py b/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py new file mode 100644 index 000000000..4f14a9bee --- /dev/null +++ b/src/local_deep_research/web_search_engines/engines/local_embedding_manager.py @@ -0,0 +1,290 @@ +import hashlib +import threading +import uuid +from datetime import UTC, datetime +from typing import Any, Dict, List, Optional + +from langchain_community.embeddings import ( + HuggingFaceEmbeddings, +) +from langchain_core.documents import Document +from loguru import logger + +from ...database.models.library import DocumentChunk +from ...database.session_context import get_user_db_session +from ...utilities.url_utils import normalize_url + + +class LocalEmbeddingManager: + """Handles embedding generation and storage for local document search""" + + def __init__( + self, + embedding_model: str = "all-MiniLM-L6-v2", + embedding_device: str = "cpu", + embedding_model_type: str = "sentence_transformers", # or 'ollama' + ollama_base_url: Optional[str] = None, + settings_snapshot: Optional[Dict[str, Any]] = None, + ): + """ + Initialize the embedding manager for local document search. + + Args: + embedding_model: Name of the embedding model to use + embedding_device: Device to run embeddings on ('cpu' or 'cuda') + embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama') + ollama_base_url: Base URL for Ollama API if using ollama embeddings + settings_snapshot: Optional settings snapshot for background threads + """ + + self.embedding_model = embedding_model + self.embedding_device = embedding_device + self.embedding_model_type = embedding_model_type + self.ollama_base_url = ollama_base_url + self.settings_snapshot = settings_snapshot or {} + + # Username for database access (extracted from settings if available) + self.username = ( + settings_snapshot.get("_username") if settings_snapshot else None + ) + # Password for encrypted database access (can be set later) + self.db_password = None + + # Initialize the embedding model (with lock for thread-safe lazy init) + self._embeddings = None + self._embedding_lock = threading.Lock() + + # Vector store cache + self.vector_stores = {} + + # Track if this manager has been closed + self._closed = False + + def close(self): + """Release embedding model resources.""" + if self._closed: + return + self._closed = True + # Clear embedding model reference to allow garbage collection + self._embeddings = None + # Clear vector store cache + self.vector_stores.clear() + logger.debug("LocalEmbeddingManager closed") + + def __enter__(self): + """Context manager entry.""" + return self + + def __exit__(self, exc_type, exc_val, exc_tb): + """Context manager exit - ensures resources are released.""" + self.close() + return False + + @property + def embeddings(self): + """ + Lazily initialize embeddings when first accessed. + This allows the LocalEmbeddingManager to be created without + immediately loading models, which is helpful when no local search is performed. + + Uses double-checked locking to ensure thread-safe initialization. + Concurrent SentenceTransformer model loading causes meta tensor errors + in PyTorch when multiple threads call model.to(device) simultaneously. + """ + if self._embeddings is None: + with self._embedding_lock: + if self._embeddings is None: + logger.info("Initializing embeddings on first use") + self._embeddings = self._initialize_embeddings() + return self._embeddings + + def _initialize_embeddings(self): + """Initialize the embedding model based on configuration""" + try: + # Use the new unified embedding system + from ...embeddings import get_embeddings + + # Prepare kwargs for provider-specific parameters + kwargs = {} + + # Add device for sentence transformers + if self.embedding_model_type == "sentence_transformers": + kwargs["device"] = self.embedding_device + + # Add base_url for ollama if specified + if self.embedding_model_type == "ollama" and self.ollama_base_url: + kwargs["base_url"] = normalize_url(self.ollama_base_url) + + logger.info( + f"Initializing embeddings with provider={self.embedding_model_type}, model={self.embedding_model}" + ) + + return get_embeddings( + provider=self.embedding_model_type, + model=self.embedding_model, + settings_snapshot=self.settings_snapshot, + **kwargs, + ) + except Exception: + logger.exception("Error initializing embeddings") + logger.warning( + "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2" + ) + return HuggingFaceEmbeddings( + model_name="sentence-transformers/all-MiniLM-L6-v2" + ) + + def _store_chunks_to_db( + self, + chunks: List[Document], + collection_name: str, + source_path: Optional[str] = None, + source_id: Optional[int] = None, + source_type: str = "local_file", + ) -> List[str]: + """ + Store document chunks in the database. + + Args: + chunks: List of LangChain Document chunks + collection_name: Name of the collection (e.g., 'personal_notes', 'library') + source_path: Path to source file (for local files) + source_id: ID of source document (for library documents) + source_type: Type of source ('local_file' or 'library') + + Returns: + List of chunk embedding IDs (UUIDs) for FAISS mapping + """ + if not self.username: + logger.warning( + "No username available, cannot store chunks in database" + ) + return [] + + chunk_ids = [] + + try: + with get_user_db_session( + self.username, self.db_password + ) as session: + for idx, chunk in enumerate(chunks): + # Generate unique hash for chunk + chunk_text = chunk.page_content + chunk_hash = hashlib.sha256(chunk_text.encode()).hexdigest() + + # Generate unique embedding ID + embedding_id = uuid.uuid4().hex + + # Extract metadata + metadata = chunk.metadata or {} + document_title = metadata.get( + "filename", metadata.get("title", "Unknown") + ) + + # Calculate word count + word_count = len(chunk_text.split()) + + # Get character positions from metadata if available + start_char = metadata.get("start_char", 0) + end_char = metadata.get("end_char", len(chunk_text)) + + # Check if chunk already exists + existing_chunk = ( + session.query(DocumentChunk) + .filter_by(chunk_hash=chunk_hash) + .first() + ) + + if existing_chunk: + # Update existing chunk + existing_chunk.last_accessed = datetime.now(UTC) + chunk_ids.append(existing_chunk.embedding_id) + logger.debug( + f"Chunk already exists, reusing: {existing_chunk.embedding_id}" + ) + else: + # Create new chunk + db_chunk = DocumentChunk( + chunk_hash=chunk_hash, + source_type=source_type, + source_id=source_id, + source_path=str(source_path) + if source_path + else None, + collection_name=collection_name, + chunk_text=chunk_text, + chunk_index=idx, + start_char=start_char, + end_char=end_char, + word_count=word_count, + embedding_id=embedding_id, + embedding_model=self.embedding_model, + embedding_model_type=self.embedding_model_type, + document_title=document_title, + document_metadata=metadata, + ) + session.add(db_chunk) + chunk_ids.append(embedding_id) + + session.commit() + logger.info( + f"Stored {len(chunk_ids)} chunks to database for collection '{collection_name}'" + ) + + except Exception: + logger.exception( + f"Error storing chunks to database for collection '{collection_name}'" + ) + return [] + + return chunk_ids + + def _delete_chunks_from_db( + self, + collection_name: str, + source_path: Optional[str] = None, + source_id: Optional[int] = None, + ) -> int: + """ + Delete chunks from database. + + Args: + collection_name: Name of the collection + source_path: Path to source file (for local files) + source_id: ID of source document (for library documents) + + Returns: + Number of chunks deleted + """ + if not self.username: + logger.warning( + "No username available, cannot delete chunks from database" + ) + return 0 + + try: + with get_user_db_session( + self.username, self.db_password + ) as session: + query = session.query(DocumentChunk).filter_by( + collection_name=collection_name + ) + + if source_path: + query = query.filter_by(source_path=str(source_path)) + if source_id: + query = query.filter_by(source_id=source_id) + + count = query.delete() + session.commit() + + logger.info( + f"Deleted {count} chunks from database for collection '{collection_name}'" + ) + return count + + except Exception: + logger.exception( + f"Error deleting chunks from database for collection '{collection_name}'" + ) + return 0 diff --git a/src/local_deep_research/web_search_engines/engines/search_engine_local.py b/src/local_deep_research/web_search_engines/engines/search_engine_local.py deleted file mode 100644 index 9f3d503a2..000000000 --- a/src/local_deep_research/web_search_engines/engines/search_engine_local.py +++ /dev/null @@ -1,1467 +0,0 @@ -import hashlib -import json -import os -import threading -import time -import uuid -from concurrent.futures import ProcessPoolExecutor -from datetime import UTC, datetime -from pathlib import Path -from typing import Any, Dict, Iterable, List, Optional - -import numpy as np -from faiss import IndexFlatL2 -from langchain_community.docstore.in_memory import InMemoryDocstore -from langchain_community.document_loaders import TextLoader -from langchain_community.embeddings import ( - HuggingFaceEmbeddings, -) -from langchain_community.vectorstores import FAISS -from langchain_core.document_loaders import BaseLoader -from langchain_core.documents import Document -from langchain_core.language_models import BaseLLM -from langchain_text_splitters import RecursiveCharacterTextSplitter -from loguru import logger - -from ...config import search_config -from ...config.paths import get_cache_directory -from ...database.models.library import DocumentChunk -from ...database.session_context import get_user_db_session -from ...document_loaders import get_loader_for_path, is_extension_supported -from ...utilities.url_utils import normalize_url -from ..search_engine_base import BaseSearchEngine - - -def _get_file_loader(file_path: str) -> Optional[BaseLoader]: - """Get an appropriate document loader for a file based on its extension. - - Uses the centralized document_loaders registry which supports 35+ file formats. - """ - file_path_obj = Path(file_path) - extension = file_path_obj.suffix.lower() - - # Check if extension is supported by the registry - if is_extension_supported(extension): - loader = get_loader_for_path(file_path) - if loader: - return loader - - # Fallback to TextLoader for unknown extensions - logger.warning(f"Unknown file extension for {file_path}, trying TextLoader") - try: - return TextLoader( - str(file_path), encoding="utf-8", autodetect_encoding=True - ) - except Exception: - logger.exception(f"Error creating loader for {file_path}") - return None - - -def _load_document(file_path: Path) -> List[Document]: - """ - Loads documents from a file. - - Args: - file_path: The path to the document to load. - - Returns: - The loaded documents, or an empty list if it failed to load. - - """ - # Get a loader for this file - loader = _get_file_loader(str(file_path)) - - if loader is None: - # No loader for this filetype. - return [] - - try: - # Load the document - docs = loader.load() - - # Add source path metadata and ID. - for doc in docs: - doc.metadata["source"] = str(file_path) - doc.metadata["filename"] = file_path.name - - except Exception: - logger.exception(f"Error loading {file_path}") - return [] - - return docs - - -class LocalEmbeddingManager: - """Handles embedding generation and storage for local document search""" - - def __init__( - self, - embedding_model: str = "all-MiniLM-L6-v2", - embedding_device: str = "cpu", - embedding_model_type: str = "sentence_transformers", # or 'ollama' - ollama_base_url: Optional[str] = None, - chunk_size: int = 1000, - chunk_overlap: int = 200, - cache_dir: Optional[str] = None, - settings_snapshot: Optional[Dict[str, Any]] = None, - ): - """ - Initialize the embedding manager for local document search. - - Args: - embedding_model: Name of the embedding model to use - embedding_device: Device to run embeddings on ('cpu' or 'cuda') - embedding_model_type: Type of embedding model ('sentence_transformers' or 'ollama') - ollama_base_url: Base URL for Ollama API if using ollama embeddings - chunk_size: Size of text chunks for splitting documents - chunk_overlap: Overlap between chunks - cache_dir: Directory to store embedding cache and index. - If None, uses the app's configured cache directory. - settings_snapshot: Optional settings snapshot for background threads - """ - - self.embedding_model = embedding_model - self.embedding_device = embedding_device - self.embedding_model_type = embedding_model_type - self.ollama_base_url = ollama_base_url - self.chunk_size = chunk_size - self.chunk_overlap = chunk_overlap - # Use configured cache directory if not specified - if cache_dir is None: - self.cache_dir = get_cache_directory() / "local_search" - else: - self.cache_dir = Path(cache_dir) - self.settings_snapshot = settings_snapshot or {} - - # Username for database access (extracted from settings if available) - self.username = ( - settings_snapshot.get("_username") if settings_snapshot else None - ) - # Password for encrypted database access (can be set later) - self.db_password = None - - # Create cache directory if it doesn't exist - self.cache_dir.mkdir(parents=True, exist_ok=True) - - # Initialize the embedding model (with lock for thread-safe lazy init) - self._embeddings = None - self._embedding_lock = threading.Lock() - - # Initialize the text splitter - self.text_splitter = RecursiveCharacterTextSplitter( - chunk_size=chunk_size, chunk_overlap=chunk_overlap - ) - - # Track indexed folders and their metadata - self.indexed_folders = self._load_indexed_folders() - - # Vector store cache - self.vector_stores = {} - - # Track if this manager has been closed - self._closed = False - - def close(self): - """Release embedding model resources.""" - if self._closed: - return - self._closed = True - # Clear embedding model reference to allow garbage collection - self._embeddings = None - # Clear vector store cache - self.vector_stores.clear() - logger.debug("LocalEmbeddingManager closed") - - def __enter__(self): - """Context manager entry.""" - return self - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit - ensures resources are released.""" - self.close() - return False - - @property - def embeddings(self): - """ - Lazily initialize embeddings when first accessed. - This allows the LocalEmbeddingManager to be created without - immediately loading models, which is helpful when no local search is performed. - - Uses double-checked locking to ensure thread-safe initialization. - Concurrent SentenceTransformer model loading causes meta tensor errors - in PyTorch when multiple threads call model.to(device) simultaneously. - """ - if self._embeddings is None: - with self._embedding_lock: - if self._embeddings is None: - logger.info("Initializing embeddings on first use") - self._embeddings = self._initialize_embeddings() - return self._embeddings - - def _initialize_embeddings(self): - """Initialize the embedding model based on configuration""" - try: - # Use the new unified embedding system - from ...embeddings import get_embeddings - - # Prepare kwargs for provider-specific parameters - kwargs = {} - - # Add device for sentence transformers - if self.embedding_model_type == "sentence_transformers": - kwargs["device"] = self.embedding_device - - # Add base_url for ollama if specified - if self.embedding_model_type == "ollama" and self.ollama_base_url: - kwargs["base_url"] = normalize_url(self.ollama_base_url) - - logger.info( - f"Initializing embeddings with provider={self.embedding_model_type}, model={self.embedding_model}" - ) - - return get_embeddings( - provider=self.embedding_model_type, - model=self.embedding_model, - settings_snapshot=self.settings_snapshot, - **kwargs, - ) - except Exception: - logger.exception("Error initializing embeddings") - logger.warning( - "Falling back to HuggingFaceEmbeddings with all-MiniLM-L6-v2" - ) - return HuggingFaceEmbeddings( - model_name="sentence-transformers/all-MiniLM-L6-v2" - ) - - def _store_chunks_to_db( - self, - chunks: List[Document], - collection_name: str, - source_path: Optional[str] = None, - source_id: Optional[int] = None, - source_type: str = "local_file", - ) -> List[str]: - """ - Store document chunks in the database. - - Args: - chunks: List of LangChain Document chunks - collection_name: Name of the collection (e.g., 'personal_notes', 'library') - source_path: Path to source file (for local files) - source_id: ID of source document (for library documents) - source_type: Type of source ('local_file' or 'library') - - Returns: - List of chunk embedding IDs (UUIDs) for FAISS mapping - """ - if not self.username: - logger.warning( - "No username available, cannot store chunks in database" - ) - return [] - - chunk_ids = [] - - try: - with get_user_db_session( - self.username, self.db_password - ) as session: - for idx, chunk in enumerate(chunks): - # Generate unique hash for chunk - chunk_text = chunk.page_content - chunk_hash = hashlib.sha256(chunk_text.encode()).hexdigest() - - # Generate unique embedding ID - embedding_id = uuid.uuid4().hex - - # Extract metadata - metadata = chunk.metadata or {} - document_title = metadata.get( - "filename", metadata.get("title", "Unknown") - ) - - # Calculate word count - word_count = len(chunk_text.split()) - - # Get character positions from metadata if available - start_char = metadata.get("start_char", 0) - end_char = metadata.get("end_char", len(chunk_text)) - - # Check if chunk already exists - existing_chunk = ( - session.query(DocumentChunk) - .filter_by(chunk_hash=chunk_hash) - .first() - ) - - if existing_chunk: - # Update existing chunk - existing_chunk.last_accessed = datetime.now(UTC) - chunk_ids.append(existing_chunk.embedding_id) - logger.debug( - f"Chunk already exists, reusing: {existing_chunk.embedding_id}" - ) - else: - # Create new chunk - db_chunk = DocumentChunk( - chunk_hash=chunk_hash, - source_type=source_type, - source_id=source_id, - source_path=str(source_path) - if source_path - else None, - collection_name=collection_name, - chunk_text=chunk_text, - chunk_index=idx, - start_char=start_char, - end_char=end_char, - word_count=word_count, - embedding_id=embedding_id, - embedding_model=self.embedding_model, - embedding_model_type=self.embedding_model_type, - document_title=document_title, - document_metadata=metadata, - ) - session.add(db_chunk) - chunk_ids.append(embedding_id) - - session.commit() - logger.info( - f"Stored {len(chunk_ids)} chunks to database for collection '{collection_name}'" - ) - - except Exception: - logger.exception( - f"Error storing chunks to database for collection '{collection_name}'" - ) - return [] - - return chunk_ids - - def _load_chunks_from_db( - self, chunk_ids: List[str], username: Optional[str] = None - ) -> List[Dict[str, Any]]: - """ - Load chunks from database by their embedding IDs. - - Args: - chunk_ids: List of embedding IDs to load - username: Username for database access (uses self.username if not provided) - - Returns: - List of chunk dictionaries with content and metadata - """ - username = username or self.username - if not username: - logger.warning( - "No username available, cannot load chunks from database" - ) - return [] - - chunks = [] - - try: - with get_user_db_session(username) as session: - db_chunks = ( - session.query(DocumentChunk) - .filter(DocumentChunk.embedding_id.in_(chunk_ids)) - .all() - ) - - for db_chunk in db_chunks: - # Update last accessed time - db_chunk.last_accessed = datetime.now(UTC) - - chunks.append( - { - "id": db_chunk.embedding_id, - "content": db_chunk.chunk_text, - "metadata": { - "source_type": db_chunk.source_type, - "source_path": db_chunk.source_path, - "source_id": db_chunk.source_id, - "collection": db_chunk.collection_name, - "chunk_index": db_chunk.chunk_index, - "word_count": db_chunk.word_count, - "title": db_chunk.document_title, - **db_chunk.document_metadata, - }, - } - ) - - session.commit() # Commit the last_accessed updates - - except Exception: - logger.exception("Error loading chunks from database") - return [] - - return chunks - - def _delete_chunks_from_db( - self, - collection_name: str, - source_path: Optional[str] = None, - source_id: Optional[int] = None, - ) -> int: - """ - Delete chunks from database. - - Args: - collection_name: Name of the collection - source_path: Path to source file (for local files) - source_id: ID of source document (for library documents) - - Returns: - Number of chunks deleted - """ - if not self.username: - logger.warning( - "No username available, cannot delete chunks from database" - ) - return 0 - - try: - with get_user_db_session( - self.username, self.db_password - ) as session: - query = session.query(DocumentChunk).filter_by( - collection_name=collection_name - ) - - if source_path: - query = query.filter_by(source_path=str(source_path)) - if source_id: - query = query.filter_by(source_id=source_id) - - count = query.delete() - session.commit() - - logger.info( - f"Deleted {count} chunks from database for collection '{collection_name}'" - ) - return count - - except Exception: - logger.exception( - f"Error deleting chunks from database for collection '{collection_name}'" - ) - return 0 - - def _load_or_create_vector_store(self): - """Load the vector store from disk or create it if needed""" - vector_store_path = self._get_vector_store_path() - - # Check if vector store exists and is up to date - if vector_store_path.exists() and not self._check_folders_modified(): - logger.info( - f"Loading existing vector store from {vector_store_path}" - ) - try: - vector_store = FAISS.load_local( - str(vector_store_path), - self.embeddings, - allow_dangerous_deserialization=True, - ) - - # Add this code to show document count - doc_count = len(vector_store.index_to_docstore_id) - logger.info(f"Loaded index with {doc_count} document chunks") - - return vector_store - except Exception: - logger.exception("Error loading vector store") - logger.info("Will create a new vector store") - - # Create a new vector store - return self._create_vector_store() - - def _load_indexed_folders(self) -> Dict[str, Dict[str, Any]]: - """Load metadata about indexed folders from disk""" - index_metadata_path = self.cache_dir / "index_metadata.json" - - if index_metadata_path.exists(): - try: - with open(index_metadata_path, "r") as f: - return json.load(f) - except Exception: - logger.exception("Error loading index metadata") - - return {} - - def _save_indexed_folders(self): - """Save metadata about indexed folders to disk""" - index_metadata_path = self.cache_dir / "index_metadata.json" - - try: - with open(index_metadata_path, "w") as f: - json.dump(self.indexed_folders, f, indent=2) - except Exception: - logger.exception("Error saving index metadata") - - @staticmethod - def get_folder_hash(folder_path: Path) -> str: - """Generate a hash for a folder based on its path""" - # Canonicalize the path so we don't have weird Windows vs. Linux - # problems or issues with trailing slashes. - canonical_folder_path = "/".join(folder_path.parts) - return hashlib.md5( # DevSkim: ignore DS126858 - canonical_folder_path.encode(), usedforsecurity=False - ).hexdigest() - - def _get_index_path(self, folder_path: Path) -> Path: - """Get the path where the index for a specific folder should be stored""" - folder_hash = self.get_folder_hash(folder_path) - return self.cache_dir / f"index_{folder_hash}" - - def _check_folder_modified(self, folder_path: Path) -> bool: - """Check if a folder has been modified since it was last indexed""" - - @staticmethod - def _get_all_files(folder_path: Path) -> Iterable[Path]: - """ - Gets all the files, recursively, in a folder. - - Args: - folder_path: The path to the folder. - - Yields: - Each of the files in the folder. - - """ - for root, _, files in os.walk(folder_path): - for file in files: - yield Path(root) / file - - def _get_modified_files(self, folder_path: Path) -> List[Path]: - """ - Gets the files in a folder that have been modified since it was last - indexed. - - Args: - folder_path: The path to the folder to check. - - Returns: - A list of the files that were modified. - - """ - if not folder_path.exists() or not folder_path.is_dir(): - return [] - - folder_hash = self.get_folder_hash(folder_path) - - if folder_hash not in self.indexed_folders: - # If folder has never been indexed, everything has been modified. - last_indexed = 0 - indexed_files = set() - else: - last_indexed = self.indexed_folders[folder_hash].get( - "last_indexed", 0 - ) - indexed_files = ( - self.indexed_folders[folder_hash] - .get("indexed_files", {}) - .keys() - ) - - # Check if any file in the folder has been modified since last indexing - modified_files = [] - for file_path in self._get_all_files(folder_path): - file_stats = file_path.stat() - if file_stats.st_mtime > last_indexed: - modified_files.append(file_path) - elif str(file_path.relative_to(folder_path)) not in indexed_files: - # This file somehow never got indexed. - modified_files.append(file_path) - - return modified_files - - def _check_config_changed(self, folder_path: Path) -> bool: - """ - Checks if the embedding configuration for a folder has been changed - since it was last indexed. - """ - folder_hash = self.get_folder_hash(folder_path) - - if folder_hash not in self.indexed_folders: - # It hasn't been indexed at all. That's a new configuration, - # technically. - return True - - embedding_config = self.indexed_folders[folder_hash] - chunk_size = int(embedding_config.get("chunk_size", 0)) - chunk_overlap = int(embedding_config.get("chunk_overlap", 0)) - embedding_model = embedding_config.get("embedding_model", "") - - if (chunk_size, chunk_overlap, embedding_model) != ( - self.chunk_size, - self.chunk_overlap, - self.embedding_model, - ): - logger.info( - "Embedding configuration has changed, re-indexing folder." - ) - return True - return False - - def index_folder( - self, folder_path: str, force_reindex: bool = False - ) -> bool: - """ - Index all documents in a folder for vector search. - - Args: - folder_path: Path to the folder to index - force_reindex: Whether to force reindexing even if unchanged - - Returns: - bool: True if indexing was successful, False otherwise - """ - folder_path = Path(folder_path) - - # Validate folder - if not folder_path.exists(): - logger.error(f"Folder not found: {folder_path}") - return False - - if not folder_path.is_dir(): - logger.error(f"Path is not a directory: {folder_path}") - return False - - folder_str = str(folder_path) - folder_hash = self.get_folder_hash(folder_path) - index_path = self._get_index_path(folder_path) - - if force_reindex or self._check_config_changed(folder_path): - logger.info(f"Re-indexing entire folder: {folder_path}") - modified_files = list(self._get_all_files(folder_path)) - else: - # Just re-index the modified files if we can get away with it. - modified_files = self._get_modified_files(folder_path) - logger.info(f"Re-indexing {len(modified_files)} modified files...") - - # Load the vector store from disk if not already loaded - if folder_hash not in self.vector_stores and index_path.exists(): - try: - self.vector_stores[folder_hash] = FAISS.load_local( - str(index_path), - self.embeddings, - allow_dangerous_deserialization=True, - ) - logger.info(f"Loaded index for {folder_path} from disk") - except Exception: - logger.exception(f"Error loading index for {folder_path}") - # If loading fails, force reindexing - force_reindex = True - - logger.info(f"Indexing folder: {folder_path}") - start_time = time.time() - - # Find documents to index - all_docs = [] - - # Remove hidden files and directories. - modified_files = [ - p - for p in modified_files - if not p.name.startswith(".") - and not any(part.startswith(".") for part in p.parts) - ] - # Index them. - with ProcessPoolExecutor() as executor: - all_docs_nested = executor.map(_load_document, modified_files) - # Flatten the result. - for docs in all_docs_nested: - all_docs.extend(docs) - - if force_reindex or folder_hash not in self.vector_stores: - logger.info(f"Creating new index for {folder_path}") - # Embed a test query to figure out embedding length. - test_embedding = self.embeddings.embed_query("hello world") - index = IndexFlatL2(len(test_embedding)) - # Use minimal docstore - chunks are stored in database - self.vector_stores[folder_hash] = FAISS( - self.embeddings, - index=index, - docstore=InMemoryDocstore(), # Minimal - just for FAISS compatibility - index_to_docstore_id={}, - normalize_L2=True, - ) - - # Split documents into chunks - logger.info(f"Splitting {len(all_docs)} documents into chunks") - splits = self.text_splitter.split_documents(all_docs) - logger.info( - f"Created {len(splits)} chunks from {len(modified_files)} files" - ) - - # Store chunks in database and get embedding IDs - embedding_ids = [] - if splits: - logger.info(f"Storing {len(splits)} chunks in database") - # Get collection name from folder path (last folder name) - collection_name = folder_path.name - - # Store chunks to database - embedding_ids = self._store_chunks_to_db( - chunks=splits, - collection_name=collection_name, - source_type="local_file", - ) - - logger.info(f"Adding {len(splits)} chunks to FAISS index") - # Add embeddings to FAISS using the database-generated IDs - self.vector_stores[folder_hash].add_documents( - splits, ids=embedding_ids - ) - - # Update indexing time for individual files. - index_time = time.time() - indexed_files = {} - if folder_hash in self.indexed_folders: - indexed_files = ( - self.indexed_folders[folder_hash] - .get("indexed_files", {}) - .copy() - ) - for embedding_id, split in zip(embedding_ids, splits, strict=False): - split_source = str( - Path(split.metadata["source"]).relative_to(folder_path) - ) - id_list = indexed_files.setdefault(split_source, []) - id_list.append(embedding_id) - - # Check for any files that were removed and remove them from the - # vector store and database. - delete_ids = [] - delete_paths = [] - for relative_path, chunk_ids in indexed_files.items(): - if not (folder_path / Path(relative_path)).exists(): - delete_ids.extend(chunk_ids) - delete_paths.append(relative_path) - if delete_ids: - logger.info( - f"Deleting {len(delete_paths)} non-existent files from the " - f"index and database." - ) - # Delete from FAISS index - self.vector_stores[folder_hash].delete(delete_ids) - - # Delete from database - collection_name = folder_path.name - for delete_path in delete_paths: - full_path = str(folder_path / delete_path) - deleted_count = self._delete_chunks_from_db( - collection_name=collection_name, - source_path=full_path, - ) - logger.debug( - f"Deleted {deleted_count} chunks for {delete_path} from database" - ) - for path in delete_paths: - del indexed_files[path] - - # Save the vector store to disk - logger.info(f"Saving index to {index_path}") - self.vector_stores[folder_hash].save_local(str(index_path)) - - # Update metadata - self.indexed_folders[folder_hash] = { - "path": folder_str, - "last_indexed": index_time, - "file_count": len(modified_files), - "chunk_count": len(splits), - "embedding_model": self.embedding_model, - "chunk_size": self.chunk_size, - "chunk_overlap": self.chunk_overlap, - "indexed_files": indexed_files, - } - - # Save updated metadata - self._save_indexed_folders() - - elapsed_time = time.time() - start_time - logger.info( - f"Indexed {len(modified_files)} files in {elapsed_time:.2f} seconds" - ) - - return True - - def search( - self, - query: str, - folder_paths: List[str], - limit: int = 10, - score_threshold: float = 0.0, - ) -> List[Dict[str, Any]]: - """ - Search for documents relevant to a query across specified folders. - - Args: - query: The search query - folder_paths: List of folder paths to search in - limit: Maximum number of results to return - score_threshold: Minimum similarity score threshold - - Returns: - List of results with document content and metadata - """ - folder_paths = [Path(p) for p in folder_paths] - - # Add detailed debugging for each folder - for folder_path in folder_paths: - folder_hash = self.get_folder_hash(folder_path) - index_path = self._get_index_path(folder_path) - - logger.info(f"Diagnostic for {folder_path}:") - logger.info(f" - Folder hash: {folder_hash}") - logger.info(f" - Index path: {index_path}") - logger.info(f" - Index exists on disk: {index_path.exists()}") - logger.info( - f" - Is in indexed_folders: {folder_hash in self.indexed_folders}" - ) - - if folder_hash in self.indexed_folders: - meta = self.indexed_folders[folder_hash] - logger.info( - f" - Metadata: file_count={meta.get('file_count', 0)}, chunk_count={meta.get('chunk_count', 0)}" - ) - - # Validate folders exist - valid_folder_paths = [] - for path in folder_paths: - if path.exists() and path.is_dir(): - valid_folder_paths.append(path) - else: - logger.warning( - f"Skipping non-existent folder in search: {path}" - ) - - # If no valid folders, return empty results - if not valid_folder_paths: - logger.warning(f"No valid folders to search among: {folder_paths}") - return [] - - all_results = [] - - for folder_path in valid_folder_paths: - folder_hash = self.get_folder_hash(folder_path) - - # Skip folders that haven't been indexed - if folder_hash not in self.indexed_folders: - logger.warning(f"Folder {folder_path} has not been indexed") - continue - - # Make sure the vector store is loaded - if folder_hash not in self.vector_stores: - index_path = self._get_index_path(folder_path) - try: - self.vector_stores[folder_hash] = FAISS.load_local( - str(index_path), - self.embeddings, - allow_dangerous_deserialization=True, - ) - except Exception: - logger.exception(f"Error loading index for {folder_path}") - continue - - # Search in this folder - vector_store = self.vector_stores[folder_hash] - - try: - # Get query embedding - query_vector = self.embeddings.embed_query(query) - - # Search FAISS index for similar vectors - # Returns: (distances, indices) where indices are FAISS internal indices - distances, indices = vector_store.index.search( - np.array([query_vector], dtype=np.float32), limit - ) - - # Convert distances to similarity scores (L2 distance -> similarity) - # For L2: smaller distance = more similar - # Convert to similarity: 1 / (1 + distance) - similarities = 1 / (1 + distances[0]) - - # Get embedding IDs from FAISS mapping - embedding_ids = [] - valid_indices = [] - for idx, faiss_idx in enumerate(indices[0]): - if faiss_idx == -1: # FAISS returns -1 for empty results - continue - if faiss_idx in vector_store.index_to_docstore_id: - embedding_id = vector_store.index_to_docstore_id[ - faiss_idx - ] - embedding_ids.append(embedding_id) - valid_indices.append(idx) - - # Load chunks from database - if embedding_ids: - db_chunks = self._load_chunks_from_db( - embedding_ids, self.username - ) - - # Create results from database chunks - for idx, chunk in zip(valid_indices, db_chunks): - similarity = float(similarities[idx]) - - # Skip results below the threshold - if similarity < score_threshold: - continue - - # Extract metadata from chunk - metadata = chunk.get("document_metadata", {}) - if "source" not in metadata and chunk.get( - "source_path" - ): - metadata["source"] = chunk["source_path"] - - result = { - "content": chunk["chunk_text"], - "metadata": metadata, - "similarity": similarity, - "folder": folder_path, - } - - all_results.append(result) - except Exception: - logger.exception(f"Error searching in {folder_path}") - - # Sort by similarity (highest first) - all_results.sort(key=lambda x: x["similarity"], reverse=True) - - # Limit to the requested number - return all_results[:limit] - - def clear_cache(self): - """Clear all cached vector stores from memory (not disk)""" - self.vector_stores.clear() - - def get_indexed_folders_info(self) -> List[Dict[str, Any]]: - """Get information about all indexed folders""" - info = [] - - for folder_hash, metadata in self.indexed_folders.items(): - folder_info = metadata.copy() - - # Add formatted last indexed time - if "last_indexed" in folder_info: - folder_info["last_indexed_formatted"] = datetime.fromtimestamp( - folder_info["last_indexed"] - ).strftime("%Y-%m-%d %H:%M:%S") - - # Check if index file exists - index_path = self._get_index_path(Path(folder_info["path"])) - folder_info["index_exists"] = index_path.exists() - - info.append(folder_info) - - return info - - -class LocalSearchEngine(BaseSearchEngine): - """Local document search engine with two-phase retrieval""" - - def __init__( - self, - paths: List[str], - llm: Optional[BaseLLM] = None, - max_results: int = 10, - max_filtered_results: Optional[int] = None, - embedding_model: str = "all-MiniLM-L6-v2", - embedding_device: str = "cpu", - embedding_model_type: str = "sentence_transformers", - ollama_base_url: Optional[str] = None, - force_reindex: bool = False, - chunk_size: int = 1000, - chunk_overlap: int = 200, - cache_dir: Optional[str] = None, - collections: Optional[Dict[str, Dict[str, Any]]] = None, - name: str = "", - description: str = "", - ): - """ - Initialize the local search engine. - - Args: - paths: List of folder paths to search in - llm: Language model for relevance filtering - max_results: Maximum number of results to return - max_filtered_results: Maximum results after filtering - embedding_model: Name of the embedding model to use - embedding_device: Device to run embeddings on ('cpu' or 'cuda') - embedding_model_type: Type of embedding model - ollama_base_url: Base URL for Ollama API - force_reindex: Whether to force reindexing - chunk_size: Size of text chunks for splitting documents - chunk_overlap: Overlap between chunks - cache_dir: Directory to store embedding cache and index - collections: Dictionary of named collections with paths and descriptions - name: Human-readable name of the collection we are searching. - description: Human-readable description of the collection we are - searching. - """ - # Initialize the base search engine - super().__init__(llm=llm, max_filtered_results=max_filtered_results) - - self.name = name - self.description = description - - # Validate folder paths - self.folder_paths = paths - self.valid_folder_paths = [] - for path_str in paths: - path = Path(path_str) - if path.exists() and path.is_dir(): - self.valid_folder_paths.append(path_str) - else: - logger.warning( - f"Folder not found or is not a directory: {path_str}" - ) - - # If no valid folders, log a clear message - if not self.valid_folder_paths and paths: - logger.warning(f"No valid folders found among: {paths}") - logger.warning( - "This search engine will return no results until valid folders are configured" - ) - - self.max_results = max_results - self.collections = collections or { - "default": {"paths": paths, "description": "Default collection"} - } - - # Initialize the embedding manager with only valid folders - self.embedding_manager = LocalEmbeddingManager( - embedding_model=embedding_model, - embedding_device=embedding_device, - embedding_model_type=embedding_model_type, - ollama_base_url=ollama_base_url, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - cache_dir=cache_dir, - settings_snapshot=self.settings_snapshot, - ) - - # Index all folders - self._index_folders(force_reindex) - - def _index_folders(self, force_reindex: bool = False): - """Index all valid configured folders""" - indexed = [] - failed = [] - skipped = [] - - # Keep track of invalid folders - for folder in self.folder_paths: - if folder not in self.valid_folder_paths: - skipped.append(folder) - continue - - success = self.embedding_manager.index_folder(folder, force_reindex) - if success: - indexed.append(folder) - else: - failed.append(folder) - - if indexed: - logger.info( - f"Successfully indexed {len(indexed)} folders: {', '.join(indexed)}" - ) - - if failed: - logger.warning( - f"Failed to index {len(failed)} folders: {', '.join(failed)}" - ) - - if skipped: - logger.warning( - f"Skipped {len(skipped)} invalid folders: {', '.join(skipped)}" - ) - - def _get_previews( - self, query: str, collection_names: Optional[List[str]] = None - ) -> List[Dict[str, Any]]: - """ - Get preview information for documents matching the query. - - Args: - query: The search query - collection_names: Specific collections to search within (if None, search all) - - Returns: - List of preview dictionaries - """ - # Determine which collections to search - if collection_names: - # Search only in specified collections - collections_to_search = { - name: self.collections[name] - for name in collection_names - if name in self.collections - } - if not collections_to_search: - logger.warning( - f"No valid collections found among: {collection_names}" - ) - return [] - else: - # Search in all collections - collections_to_search = self.collections - - # Extract all folder paths from the collections to search - search_paths = [] - for collection_config in collections_to_search.values(): - if "paths" in collection_config: - search_paths.extend(collection_config["paths"]) - - logger.info( - f"Searching local documents in collections: {list(collections_to_search.keys())}" - ) - - # Filter out invalid paths - valid_search_paths = [ - path for path in search_paths if path in self.valid_folder_paths - ] - - if not valid_search_paths: - logger.warning( - f"No valid folders to search in collections: {list(collections_to_search.keys())}" - ) - return [] - - # Search across the valid selected folders - raw_results = self.embedding_manager.search( - query=query, - folder_paths=valid_search_paths, - limit=self.max_results, - score_threshold=0.1, # Skip very low relevance results - ) - - if not raw_results: - logger.info(f"No local documents found for query: {query}") - return [] - - # Convert to preview format - previews = [] - for i, result in enumerate(raw_results): - # Create a unique ID - result_id = f"local-{i}-{hashlib.md5(result['content'][:50].encode(), usedforsecurity=False).hexdigest()}" # DevSkim: ignore DS126858 - - # Extract filename and path - source_path = result["metadata"].get("source", "Unknown") - filename = result["metadata"].get( - "filename", Path(source_path).name - ) - - # Create preview snippet (first ~200 chars of content) - snippet = ( - result["content"][:200] + "..." - if len(result["content"]) > 200 - else result["content"] - ) - - # Determine which collection this document belongs to - collection_name = "Unknown" - folder_path = result["folder"] - for name, collection in self.collections.items(): - if any( - folder_path.is_relative_to(path) - for path in collection.get("paths", []) - ): - break - - # Format the preview - preview = { - "id": result_id, - "title": filename, - "snippet": snippet, - "link": source_path, - "similarity": result["similarity"], - "folder": folder_path.as_posix(), - "collection": collection_name, - "collection_description": self.collections.get( - collection_name, {} - ).get("description", ""), - "_full_content": result[ - "content" - ], # Store full content for later - "_metadata": result["metadata"], # Store metadata for later - } - - previews.append(preview) - - logger.info(f"Found {len(previews)} local document matches") - return previews - - def _get_full_content( - self, relevant_items: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: - """ - Get full content for the relevant documents. - For local search, the full content is already available. - - Args: - relevant_items: List of relevant preview dictionaries - - Returns: - List of result dictionaries with full content - """ - # Check if we should add full content - if ( - hasattr(search_config, "SEARCH_SNIPPETS_ONLY") - and search_config.SEARCH_SNIPPETS_ONLY - ): - logger.info("Snippet-only mode, skipping full content addition") - return relevant_items - - # For local search, we already have the full content - results = [] - for item in relevant_items: - # Create a copy with full content - result = item.copy() - - # Add full content if we have it - if "_full_content" in item: - result["content"] = item["_full_content"] - result["full_content"] = item["_full_content"] - - # Remove temporary fields - if "_full_content" in result: - del result["_full_content"] - - # Add metadata if we have it - if "_metadata" in item: - result["document_metadata"] = item["_metadata"] - - # Remove temporary fields - if "_metadata" in result: - del result["_metadata"] - - results.append(result) - - return results - - def run( - self, - query: str, - research_context: Dict[str, Any] | None = None, - collection_names: Optional[List[str]] = None, - ) -> List[Dict[str, Any]]: - """ - Execute a search using the two-phase approach. - - Args: - query: The search query - research_context: Context from previous research to use. - collection_names: Specific collections to search within (if None, search all) - - Returns: - List of search result dictionaries with full content - """ - logger.info("---Execute a search using Local Documents---") - - # Check if we have any special collection parameters in the query - collection_prefix = "collection:" - remaining_query = query - specified_collections = [] - - # Parse query for collection specifications like "collection:research_papers query terms" - query_parts = query.split() - for part in query_parts: - if part.lower().startswith(collection_prefix): - collection_name = part[len(collection_prefix) :].strip() - if collection_name in self.collections: - specified_collections.append(collection_name) - # Remove this part from the query - remaining_query = remaining_query.replace( - part, "", 1 - ).strip() - - # If collections were specified in the query, they override the parameter - if specified_collections: - collection_names = specified_collections - query = remaining_query - - # Phase 1: Get previews (with collection filtering) - previews = self._get_previews(query, collection_names) - - if not previews: - return [] - - # Phase 2: Filter for relevance - relevant_items = self._filter_for_relevance(previews, query) - - if not relevant_items: - return [] - - # Phase 3: Get full content for relevant items - if ( - hasattr(search_config, "SEARCH_SNIPPETS_ONLY") - and search_config.SEARCH_SNIPPETS_ONLY - ): - logger.info("Returning snippet-only results as per config") - results = relevant_items - else: - results = self._get_full_content(relevant_items) - - # Clean up temporary data - self.embedding_manager.clear_cache() - - return results - - def get_collections_info(self) -> List[Dict[str, Any]]: - """ - Get information about all collections, including indexing status. - - Returns: - List of collection information dictionaries - """ - collections_info = [] - - for name, collection in self.collections.items(): - paths = collection.get("paths", []) - paths = [Path(p) for p in paths] - description = collection.get("description", "") - - # Get indexing information for each path - paths_info = [] - for path in paths: - # Check if folder exists - exists = path.exists() and path.is_dir() - - # Check if folder is indexed - folder_hash = self.embedding_manager.get_folder_hash(path) - indexed = folder_hash in self.embedding_manager.indexed_folders - - # Get index details if available - index_info = {} - if indexed: - index_info = self.embedding_manager.indexed_folders[ - folder_hash - ].copy() - - paths_info.append( - { - "path": path, - "exists": exists, - "indexed": indexed, - "index_info": index_info, - } - ) - - collections_info.append( - { - "name": name, - "description": description, - "paths": paths, - "paths_info": paths_info, - "document_count": sum( - info.get("index_info", {}).get("file_count", 0) - for info in paths_info - ), - "chunk_count": sum( - info.get("index_info", {}).get("chunk_count", 0) - for info in paths_info - ), - "all_indexed": all( - info["indexed"] for info in paths_info if info["exists"] - ), - } - ) - - return collections_info - - def reindex_collection(self, collection_name: str) -> bool: - """ - Reindex a specific collection. - - Args: - collection_name: Name of the collection to reindex - - Returns: - True if reindexing was successful, False otherwise - """ - if collection_name not in self.collections: - logger.error(f"Collection '{collection_name}' not found") - return False - - paths = self.collections[collection_name].get("paths", []) - success = True - - for path in paths: - if not self.embedding_manager.index_folder( - path, force_reindex=True - ): - success = False - - return success - - @classmethod - def from_config( - cls, config_dict: Dict[str, Any], llm: Optional[BaseLLM] = None - ) -> "LocalSearchEngine": - """ - Create a LocalSearchEngine instance from a configuration dictionary. - - Args: - config_dict: Configuration dictionary - llm: Language model for relevance filtering - - Returns: - Initialized LocalSearchEngine instance - """ - # Required parameters - folder_paths = [] - collections = config_dict.get("collections", {}) - - # Extract all folder paths from collections - for collection_config in collections.values(): - if "paths" in collection_config: - folder_paths.extend(collection_config["paths"]) - - # Fall back to folder_paths if no collections defined - if not folder_paths: - folder_paths = config_dict.get("folder_paths", []) - # Create a default collection if using folder_paths - if folder_paths: - collections = { - "default": { - "paths": folder_paths, - "description": "Default collection", - } - } - - # Optional parameters with defaults - max_results = config_dict.get("max_results", 10) - max_filtered_results = config_dict.get("max_filtered_results") - embedding_model = config_dict.get("embedding_model", "all-MiniLM-L6-v2") - embedding_device = config_dict.get("embedding_device", "cpu") - embedding_model_type = config_dict.get( - "embedding_model_type", "sentence_transformers" - ) - ollama_base_url = config_dict.get("ollama_base_url") - force_reindex = config_dict.get("force_reindex", False) - chunk_size = config_dict.get("chunk_size", 1000) - chunk_overlap = config_dict.get("chunk_overlap", 200) - cache_dir = config_dict.get( - "cache_dir" - ) # None uses app's cache directory - - return cls( - paths=folder_paths, - collections=collections, - llm=llm, - max_results=max_results, - max_filtered_results=max_filtered_results, - embedding_model=embedding_model, - embedding_device=embedding_device, - embedding_model_type=embedding_model_type, - ollama_base_url=ollama_base_url, - force_reindex=force_reindex, - chunk_size=chunk_size, - chunk_overlap=chunk_overlap, - cache_dir=cache_dir, - ) diff --git a/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py b/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py deleted file mode 100644 index adec1d59d..000000000 --- a/src/local_deep_research/web_search_engines/engines/search_engine_local_all.py +++ /dev/null @@ -1,170 +0,0 @@ -""" -Search engine that searches across all local collections -""" - -from typing import Any, Dict, List, Optional, cast - -from langchain_core.language_models import BaseLLM -from loguru import logger - -from ..search_engine_base import BaseSearchEngine -from ..search_engine_factory import create_search_engine -from ..search_engines_config import local_search_engines -from .search_engine_local import LocalSearchEngine - - -class LocalAllSearchEngine(BaseSearchEngine): - """ - Search engine that searches across all local document collections. - Acts as a meta search engine specifically for local collections. - """ - - def __init__( - self, - llm: Optional[BaseLLM] = None, - max_results: int = 10, - max_filtered_results: Optional[int] = None, - settings_snapshot: Optional[Dict[str, Any]] = None, - programmatic_mode: bool = False, - **kwargs, - ): - """ - Initialize the local all-collections search engine. - - Args: - llm: Language model for relevance filtering - max_results: Maximum number of search results - max_filtered_results: Maximum results after filtering - settings_snapshot: Settings snapshot for thread context - programmatic_mode: If True, disables database operations and metrics tracking - **kwargs: Additional parameters passed to LocalSearchEngine instances - """ - # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results - super().__init__( - llm=llm, - max_filtered_results=max_filtered_results, - max_results=max_results, - settings_snapshot=settings_snapshot, - programmatic_mode=programmatic_mode, - ) - - # Find all local collection search engines - self.local_engines = {} - try: - for collection_id in local_search_engines(): - # Create a search engine for this collection - try: - engine = create_search_engine( - collection_id, - llm=llm, - max_filtered_results=max_filtered_results, - settings_snapshot=settings_snapshot, - programmatic_mode=programmatic_mode, - ) - engine = cast(LocalSearchEngine, engine) - - if engine: - self.local_engines[collection_id] = { - "engine": engine, - "name": engine.name, - "description": engine.description, - } - except Exception: - logger.exception( - f"Error creating search engine for collection '{collection_id}'" - ) - except ImportError: - logger.warning("No local collections configuration found") - - def _get_previews(self, query: str) -> List[Dict[str, Any]]: - """ - Get preview information for documents from all local collections. - - Args: - query: The search query - - Returns: - List of preview dictionaries - """ - logger.info( - f"Searching across all local collections for query: {query}" - ) - - all_previews = [] - - # Get previews from each local search engine - for collection_id, engine_info in self.local_engines.items(): - engine = engine_info["engine"] - try: - # Get previews from this engine - previews = engine._get_previews(query) - - # Add collection info to each preview - for preview in previews: - preview["collection_id"] = collection_id - preview["collection_name"] = engine_info["name"] - preview["collection_description"] = engine_info[ - "description" - ] - - all_previews.extend(previews) - except Exception: - logger.exception( - f"Error searching collection '{collection_id}'" - ) - - if not all_previews: - logger.info(f"No local documents found for query: {query}") - return [] - - # Sort by similarity score if available - all_previews.sort( - key=lambda x: float(x.get("similarity", 0)), reverse=True - ) - - # Limit to max_results - return all_previews[: self.max_results] - - def _get_full_content( - self, relevant_items: List[Dict[str, Any]] - ) -> List[Dict[str, Any]]: - """ - Get full content for the relevant documents. - Delegates to the appropriate collection's search engine. - - Args: - relevant_items: List of relevant preview dictionaries - - Returns: - List of result dictionaries with full content - """ - # Group items by collection - items_by_collection = {} - for item in relevant_items: - collection_id = item.get("collection_id") - if collection_id and collection_id in self.local_engines: - if collection_id not in items_by_collection: - items_by_collection[collection_id] = [] - items_by_collection[collection_id].append(item) - - # Process each collection's items with its own engine - all_results = [] - for collection_id, items in items_by_collection.items(): - engine = self.local_engines[collection_id]["engine"] - try: - results = engine._get_full_content(items) - all_results.extend(results) - except Exception: - logger.exception( - f"Error getting full content from collection '{collection_id}'" - ) - # Fall back to returning the items without full content - all_results.extend(items) - - # Add any items that weren't processed - processed_ids = set(item["id"] for item in all_results) - for item in relevant_items: - if item["id"] not in processed_ids: - all_results.append(item) - - return all_results diff --git a/src/local_deep_research/web_search_engines/rate_limiting/tracker.py b/src/local_deep_research/web_search_engines/rate_limiting/tracker.py index 1d860e058..c83a417fa 100644 --- a/src/local_deep_research/web_search_engines/rate_limiting/tracker.py +++ b/src/local_deep_research/web_search_engines/rate_limiting/tracker.py @@ -261,7 +261,6 @@ class AdaptiveRateLimitTracker: # First time seeing this engine - start optimistic and learn from real responses # Use engine-specific optimistic defaults only for what we know for sure optimistic_defaults = { - "LocalSearchEngine": 0.0, # No network calls "SearXNGSearchEngine": 0.1, # Self-hosted default engine } diff --git a/src/local_deep_research/web_search_engines/search_engines_config.py b/src/local_deep_research/web_search_engines/search_engines_config.py index 560f3070a..4234495f8 100644 --- a/src/local_deep_research/web_search_engines/search_engines_config.py +++ b/src/local_deep_research/web_search_engines/search_engines_config.py @@ -3,8 +3,7 @@ Configuration file for search engines. Loads search engine definitions from the user's configuration. """ -import json -from typing import Any, Dict, List, Optional +from typing import Any, Dict, Optional from sqlalchemy.orm import Session from loguru import logger @@ -164,66 +163,6 @@ def search_config( if "auto" in search_engines and "meta" not in search_engines: search_engines["meta"] = search_engines["auto"] - # Register local document collections - local_collections_data = ( - _get_setting( - "search.engine.local", - {}, - db_session=db_session, - settings_snapshot=settings_snapshot, - username=username, - ) - or {} - ) - local_collections_data = _extract_per_engine_config(local_collections_data) - - for collection, config in local_collections_data.items(): - if not config.get("enabled", True): - # Search engine is not enabled. Ignore. - logger.info(f"Ignoring disabled local collection '{collection}'.") - continue - - if "paths" in config and isinstance(config["paths"], str): - # This will be saved as a json array. - try: - config["paths"] = json.loads(config["paths"]) - except json.decoder.JSONDecodeError: - logger.exception( - f"Path for local collection '{collection}' is not a valid JSON array: " - f"{config['paths']}" - ) - config["paths"] = [] - - # Create a new dictionary with required search engine fields - engine_config = { - "default_params": config, - "requires_llm": True, - } - engine_config_prefix = f"search.engine.local.{collection}" - engine_config["module_path"] = _get_setting( - f"{engine_config_prefix}.module_path", - ".engines.search_engine_local", - db_session=db_session, - settings_snapshot=settings_snapshot, - username=username, - ) - engine_config["class_name"] = _get_setting( - f"{engine_config_prefix}.class_name", - "LocalSearchEngine", - db_session=db_session, - settings_snapshot=settings_snapshot, - username=username, - ) - - # Copy these specific fields to the top level if they exist - for field in ["strengths", "weaknesses", "reliability", "description"]: - if field in config: - engine_config[field] = config[field] - - search_engines[collection] = engine_config - - logger.info("Registered local document collections as search engines") - # Register Library RAG as a search engine library_enabled = _get_setting( "search.engine.library.enabled", @@ -338,45 +277,3 @@ def default_search_engine( settings_snapshot=settings_snapshot, username=username, ) - - -def local_search_engines( - username: Optional[str] = None, - db_session: Optional[Session] = None, - settings_snapshot: Optional[Dict[str, Any]] = None, -) -> List[str]: - """ - Returns a list of the enabled local search engines. - - Args: - username: Username for backward compatibility (deprecated) - db_session: Database session for direct access (preferred for web routes) - settings_snapshot: Settings snapshot for thread context (preferred for background threads) - - Returns: - A list of the enabled local search engines. - """ - local_collections_data = ( - _get_setting( - "search.engine.local", - {}, - db_session=db_session, - settings_snapshot=settings_snapshot, - username=username, - ) - or {} - ) - local_collections_data = _extract_per_engine_config(local_collections_data) - - # Don't include the `local_all` collection. - local_collections_data.pop("local_all", None) - # Remove disabled collections. - local_collections_data = { - k: v - for k, v in local_collections_data.items() - if v.get("enabled", True) - } - - enabled_collections = list(local_collections_data.keys()) - logger.debug(f"Using local collections: {enabled_collections}") - return enabled_collections diff --git a/tests/rate_limiting/test_rate_limiting.py b/tests/rate_limiting/test_rate_limiting.py index 1c8bafdc6..51826f3fb 100644 --- a/tests/rate_limiting/test_rate_limiting.py +++ b/tests/rate_limiting/test_rate_limiting.py @@ -48,7 +48,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase): "TestEngine_GetStats", "TestEngine_Reset", "SearXNGSearchEngine", - "LocalSearchEngine", ] for engine in test_engines: try: @@ -66,7 +65,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase): "TestEngine_GetStats", "TestEngine_Reset", "SearXNGSearchEngine", - "LocalSearchEngine", ] for engine in test_engines: try: @@ -84,7 +82,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase): try: self.tracker.reset_engine("TestEngine") self.tracker.reset_engine("SearXNGSearchEngine") - self.tracker.reset_engine("LocalSearchEngine") except: pass @@ -99,13 +96,6 @@ class TestAdaptiveRateLimitTracker(unittest.TestCase): searxng_wait = self.tracker.get_wait_time("SearXNGSearchEngine") self.assertEqual(searxng_wait, 0.1) # Very optimistic for self-hosted - # Test Local search (no network) - # Clear from current estimates to force default - if "LocalSearchEngine" in self.tracker.current_estimates: - del self.tracker.current_estimates["LocalSearchEngine"] - local_wait = self.tracker.get_wait_time("LocalSearchEngine") - self.assertEqual(local_wait, 0.0) # No wait for local search - @pytest.mark.timeout(30) def test_record_outcome_and_learning(self): """Test recording outcomes and learning from them. diff --git a/tests/research_library/routes/test_rag_routes.py b/tests/research_library/routes/test_rag_routes.py index caf291367..7d13f856b 100644 --- a/tests/research_library/routes/test_rag_routes.py +++ b/tests/research_library/routes/test_rag_routes.py @@ -2540,7 +2540,7 @@ class TestBackgroundThreadSettingsManagerUsage: mock_rag.return_value = Mock() with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" + "local_deep_research.web_search_engines.engines.local_embedding_manager.LocalEmbeddingManager" ): try: _get_rag_service_for_thread( diff --git a/tests/security/test_absolute_module_paths_hook.py b/tests/security/test_absolute_module_paths_hook.py index 25c24fa10..e29735801 100644 --- a/tests/security/test_absolute_module_paths_hook.py +++ b/tests/security/test_absolute_module_paths_hook.py @@ -63,7 +63,7 @@ x = "local_deep_research.some_new_package.foo.bar" def test_detects_multiple_violations(self): """Should detect multiple violations in one file.""" code = """ -a = "local_deep_research.web_search_engines.engines.search_engine_local" +a = "local_deep_research.web_search_engines.engines.local_embedding_manager" b = "local_deep_research.web_search_engines.engines.search_engine_brave" c = "local_deep_research.llm.providers.implementations.openai_provider" """ diff --git a/tests/settings/golden_master_settings.json b/tests/settings/golden_master_settings.json index f76fa5850..6705f2301 100644 --- a/tests/settings/golden_master_settings.json +++ b/tests/settings/golden_master_settings.json @@ -2169,906 +2169,6 @@ ], "visible": true }, - "search.engine.local.local_all.class_name": { - "category": "local_all", - "description": "Internal: Python class implementing local document search. Do not modify.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Class Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "LocalAllSearchEngine", - "visible": false - }, - "search.engine.local.local_all.description": { - "category": "local_all", - "description": "Human-readable description of the search engine.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Search only local documents using RAG.", - "visible": false - }, - "search.engine.local.local_all.display_name": { - "category": "local_all", - "description": "Display name to use in the U.I. for this search engine.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Display Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Local Documents", - "visible": false - }, - "search.engine.local.local_all.module_path": { - "category": "local_all", - "description": "Internal: Python module path for local search implementation. Do not modify.", - "editable": false, - "max_value": null, - "min_value": null, - "name": "Module Path", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": ".engines.search_engine_local_all", - "visible": false - }, - "search.engine.local.local_all.reliability": { - "category": "local_all", - "description": "Reliability score (0-1) for local search. Quality depends on your document collection and indexing.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.85, - "visible": true - }, - "search.engine.local.local_all.requires_api_key": { - "category": "local_all", - "description": "Local document search does not require any external API keys.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Requires Api Key", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.local_all.requires_llm": { - "category": "local_all", - "description": "Indicates this engine uses the LLM to rerank and filter results for relevance.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Requires Llm", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.local_all.strengths": { - "category": "local_all", - "description": "Advantages: Searches all local document collections at once, works offline, uses your private documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "searches all local collections", - "personal documents", - "offline access" - ], - "visible": true - }, - "search.engine.local.local_all.use_in_auto_search": { - "category": "local_all", - "description": "Include local documents in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.local_all.weaknesses": { - "category": "local_all", - "description": "Limitations: May return too many results from mixed collections, requires documents to be indexed first.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "may return too many results", - "requires indexing" - ], - "visible": true - }, - "search.engine.local.personal_notes.cache_dir": { - "category": "personal_notes", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.personal_notes.chunk_overlap": { - "category": "personal_notes", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 100, - "visible": true - }, - "search.engine.local.personal_notes.chunk_size": { - "category": "personal_notes", - "description": "Maximum characters per chunk when splitting documents for RAG indexing. Smaller = more precise, larger = more context.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 500, - "visible": true - }, - "search.engine.local.personal_notes.description": { - "category": "personal_notes", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Personal notes and documents", - "visible": true - }, - "search.engine.local.personal_notes.embedding_device": { - "category": "personal_notes", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.personal_notes.embedding_model": { - "category": "personal_notes", - "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.personal_notes.embedding_model_type": { - "category": "personal_notes", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.personal_notes.enabled": { - "category": "personal_notes", - "description": "Enable this document collection for searching. Disable if you don't want to index these documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.personal_notes.max_filtered_results": { - "category": "personal_notes", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 10, - "visible": true - }, - "search.engine.local.personal_notes.max_results": { - "category": "personal_notes", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 30, - "visible": true - }, - "search.engine.local.personal_notes.name": { - "category": "personal_notes", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Personal Notes", - "visible": true - }, - "search.engine.local.personal_notes.paths": { - "category": "personal_notes", - "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/personal_notes" - ], - "visible": true - }, - "search.engine.local.personal_notes.reliability": { - "category": "personal_notes", - "description": "Reliability score (0-1). Personal notes are rated lower (0.75) as they may contain informal or subjective content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.75, - "visible": true - }, - "search.engine.local.personal_notes.strengths": { - "category": "personal_notes", - "description": "Advantages: Access to your personal knowledge, notes, and private documents not available elsewhere.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "personal knowledge", - "notes", - "private documents" - ], - "visible": true - }, - "search.engine.local.personal_notes.use_in_auto_search": { - "category": "personal_notes", - "description": "Include personal notes in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.personal_notes.weaknesses": { - "category": "personal_notes", - "description": "Limitations: Content may be subjective, informal, or incomplete compared to published sources.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "subjective content", - "informal information" - ], - "visible": true - }, - "search.engine.local.project_docs.cache_dir": { - "category": "project_docs", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.project_docs.chunk_overlap": { - "category": "project_docs", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 200, - "visible": true - }, - "search.engine.local.project_docs.chunk_size": { - "category": "project_docs", - "description": "Maximum characters per chunk when splitting documents for RAG indexing. Larger default (1000) suits technical documentation.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 1000, - "visible": true - }, - "search.engine.local.project_docs.description": { - "category": "project_docs", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Project documentation and specifications", - "visible": true - }, - "search.engine.local.project_docs.embedding_device": { - "category": "project_docs", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.project_docs.embedding_model": { - "category": "project_docs", - "description": "Model for generating text embeddings. Default 'all-MiniLM-L6-v2' is fast and works well; larger models may improve accuracy.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.project_docs.embedding_model_type": { - "category": "project_docs", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.project_docs.enabled": { - "category": "project_docs", - "description": "Enable this document collection for searching. Disable if you don't want to index these documents.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.project_docs.max_filtered_results": { - "category": "project_docs", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 5, - "visible": true - }, - "search.engine.local.project_docs.max_results": { - "category": "project_docs", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 20, - "visible": true - }, - "search.engine.local.project_docs.name": { - "category": "project_docs", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Project Documents", - "visible": true - }, - "search.engine.local.project_docs.paths": { - "category": "project_docs", - "description": "File paths to include in this collection. Supports directories (recursively indexed) and individual files.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/project_docs/" - ], - "visible": true - }, - "search.engine.local.project_docs.reliability": { - "category": "project_docs", - "description": "Reliability score (0-1). Project docs rated moderately (0.8) as they are semi-formal technical content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.9, - "visible": true - }, - "search.engine.local.project_docs.strengths": { - "category": "project_docs", - "description": "Advantages: Access to project-specific technical docs, READMEs, and internal documentation not available online.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "project documentation", - "specifications", - "internal documents" - ], - "visible": true - }, - "search.engine.local.project_docs.use_in_auto_search": { - "category": "project_docs", - "description": "Include project documents in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.project_docs.weaknesses": { - "category": "project_docs", - "description": "Limitations: May be outdated if docs not maintained, limited scope to specific projects.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "no external information", - "limited to organizational knowledge" - ], - "visible": true - }, - "search.engine.local.research_papers.cache_dir": { - "category": "research_papers", - "description": "Directory for storing indexed embeddings and cache files. Uses system cache location by default.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Cache Dir", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": null, - "visible": true - }, - "search.engine.local.research_papers.chunk_overlap": { - "category": "research_papers", - "description": "Number of characters to overlap between chunks for context continuity during RAG indexing.", - "editable": true, - "max_value": null, - "min_value": 0, - "name": "Chunk Overlap", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 150, - "visible": true - }, - "search.engine.local.research_papers.chunk_size": { - "category": "research_papers", - "description": "Maximum characters per chunk when splitting papers for RAG indexing. Default (800) balances context and precision.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Chunk Size", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "number", - "value": 800, - "visible": true - }, - "search.engine.local.research_papers.description": { - "category": "research_papers", - "description": "Human-readable description of this document collection shown in the UI.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Description", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Academic research papers and articles", - "visible": true - }, - "search.engine.local.research_papers.embedding_device": { - "category": "research_papers", - "description": "Device for computing embeddings. 'cpu' works everywhere; 'cuda' is faster but requires NVIDIA GPU.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Device", - "options": [ - { - "label": "CPU", - "value": "cpu" - }, - { - "label": "CUDA", - "value": "cuda" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "cpu", - "visible": true - }, - "search.engine.local.research_papers.embedding_model": { - "category": "research_papers", - "description": "Model for generating text embeddings. Consider 'allenai/specter' for academic papers if available.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "all-MiniLM-L6-v2", - "visible": true - }, - "search.engine.local.research_papers.embedding_model_type": { - "category": "research_papers", - "description": "Model provider to use for generating document embeddings.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Embedding Model Type", - "options": [ - { - "label": "SentenceTransformers", - "value": "sentence_transformers" - }, - { - "label": "Ollama", - "value": "ollama" - } - ], - "step": null, - "type": "SEARCH", - "ui_element": "select", - "value": "sentence_transformers", - "visible": true - }, - "search.engine.local.research_papers.enabled": { - "category": "research_papers", - "description": "Enable this document collection for searching. Disable if you don't have local research papers.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Enabled", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": true, - "visible": true - }, - "search.engine.local.research_papers.max_filtered_results": { - "category": "research_papers", - "description": "Maximum results to return after LLM relevance filtering. These are the final results used in research.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Max Filtered Results", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": 5, - "visible": true - }, - "search.engine.local.research_papers.max_results": { - "category": "research_papers", - "description": "Maximum results from initial vector similarity search, before LLM filters for relevance.", - "editable": true, - "max_value": null, - "min_value": 1, - "name": "Max Results", - "options": null, - "step": 1, - "type": "SEARCH", - "ui_element": "number", - "value": 20, - "visible": true - }, - "search.engine.local.research_papers.name": { - "category": "research_papers", - "description": "Internal identifier for this collection. Used in logs and configuration.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Name", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "text", - "value": "Research Papers", - "visible": true - }, - "search.engine.local.research_papers.paths": { - "category": "research_papers", - "description": "File paths containing academic papers. Supports PDFs and text formats; directories are indexed recursively.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Paths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "/local_collections/research_papers/" - ], - "visible": true - }, - "search.engine.local.research_papers.reliability": { - "category": "research_papers", - "description": "Reliability score (0-1). Research papers rated high (0.95) as they are peer-reviewed academic content.", - "editable": true, - "max_value": 1.0, - "min_value": 0.0, - "name": "Reliability", - "options": null, - "step": 0.05, - "type": "SEARCH", - "ui_element": "range", - "value": 0.85, - "visible": true - }, - "search.engine.local.research_papers.strengths": { - "category": "research_papers", - "description": "Advantages: Access to peer-reviewed academic content, scientific papers, and scholarly research in your collection.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Strengths", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "academic research", - "scientific papers", - "scholarly content" - ], - "visible": true - }, - "search.engine.local.research_papers.use_in_auto_search": { - "category": "research_papers", - "description": "Include research papers in auto search mode", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Include in Auto Search", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "checkbox", - "value": false, - "visible": true - }, - "search.engine.local.research_papers.weaknesses": { - "category": "research_papers", - "description": "Limitations: Limited to papers in your collection, may be outdated if not regularly updated.", - "editable": true, - "max_value": null, - "min_value": null, - "name": "Weaknesses", - "options": null, - "step": null, - "type": "SEARCH", - "ui_element": "json", - "value": [ - "potentially outdated", - "limited to collected papers" - ], - "visible": true - }, "search.engine.web.arxiv.class_name": { "category": "arxiv", "description": "Internal: Python class implementing the arXiv search engine.", diff --git a/tests/web_search_engines/engines/test_local_embedding_manager.py b/tests/web_search_engines/engines/test_local_embedding_manager.py new file mode 100644 index 000000000..0114e36c7 --- /dev/null +++ b/tests/web_search_engines/engines/test_local_embedding_manager.py @@ -0,0 +1,96 @@ +""" +Tests for the LocalEmbeddingManager class. + +Tests cover: +- LocalEmbeddingManager initialization and configuration +- Embeddings lazy initialization +""" + +from unittest.mock import Mock, patch + + +class TestLocalEmbeddingManagerInit: + """Tests for LocalEmbeddingManager initialization.""" + + def test_init_with_defaults(self): + """Initialize with default values.""" + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) + + manager = LocalEmbeddingManager() + + assert manager.embedding_model == "all-MiniLM-L6-v2" + assert manager.embedding_device == "cpu" + assert manager.embedding_model_type == "sentence_transformers" + assert manager._embeddings is None # Lazy initialization + + def test_init_with_ollama(self): + """Initialize with Ollama embeddings.""" + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) + + manager = LocalEmbeddingManager( + embedding_model_type="ollama", + embedding_model="llama2", + ollama_base_url="http://localhost:11434", + ) + + assert manager.embedding_model_type == "ollama" + assert manager.embedding_model == "llama2" + assert manager.ollama_base_url == "http://localhost:11434" + + def test_init_with_settings_snapshot(self): + """Initialize with settings snapshot.""" + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) + + settings = {"_username": "testuser"} + manager = LocalEmbeddingManager(settings_snapshot=settings) + + assert manager.username == "testuser" + assert manager.settings_snapshot == settings + + +class TestLocalEmbeddingManagerEmbeddings: + """Tests for LocalEmbeddingManager embeddings property.""" + + def test_embeddings_lazy_initialization(self): + """Embeddings are lazily initialized.""" + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) + + manager = LocalEmbeddingManager() + + assert manager._embeddings is None + + # Mock the embeddings initialization + mock_embeddings = Mock() + with patch.object( + manager, "_initialize_embeddings", return_value=mock_embeddings + ): + embeddings = manager.embeddings + + assert embeddings is mock_embeddings + assert manager._embeddings is mock_embeddings + + def test_embeddings_reuse(self): + """Embeddings are reused after initialization.""" + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) + + manager = LocalEmbeddingManager() + + mock_embeddings = Mock() + manager._embeddings = mock_embeddings + + # Should return existing embeddings without reinitializing + with patch.object(manager, "_initialize_embeddings") as mock_init: + embeddings = manager.embeddings + + assert embeddings is mock_embeddings + mock_init.assert_not_called() diff --git a/tests/web_search_engines/engines/test_search_engine_local.py b/tests/web_search_engines/engines/test_search_engine_local.py deleted file mode 100644 index b16ee01e7..000000000 --- a/tests/web_search_engines/engines/test_search_engine_local.py +++ /dev/null @@ -1,1073 +0,0 @@ -""" -Tests for the LocalSearchEngine and LocalEmbeddingManager classes. - -Tests cover: -- Helper functions (_get_file_loader, _load_document) -- LocalEmbeddingManager initialization and methods -- LocalSearchEngine initialization and methods -- Search functionality -- Folder indexing -""" - -import json -from pathlib import Path -from unittest.mock import Mock, patch - - -class TestGetFileLoader: - """Tests for _get_file_loader helper function.""" - - def test_get_file_loader_pdf(self, tmp_path): - """Get file loader for PDF files.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - pdf_file = tmp_path / "test.pdf" - pdf_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(pdf_file)) - mock_get_loader.assert_called_once_with(str(pdf_file)) - assert result is mock_loader - - def test_get_file_loader_txt(self, tmp_path): - """Get file loader for text files.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - txt_file = tmp_path / "test.txt" - txt_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(txt_file)) - mock_get_loader.assert_called_once_with(str(txt_file)) - assert result is mock_loader - - def test_get_file_loader_markdown(self, tmp_path): - """Get file loader for markdown files.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - md_file = tmp_path / "test.md" - md_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(md_file)) - mock_get_loader.assert_called_once_with(str(md_file)) - assert result is mock_loader - - def test_get_file_loader_docx(self, tmp_path): - """Get file loader for Word documents.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - docx_file = tmp_path / "test.docx" - docx_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(docx_file)) - mock_get_loader.assert_called_once_with(str(docx_file)) - assert result is mock_loader - - def test_get_file_loader_csv(self, tmp_path): - """Get file loader for CSV files.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - csv_file = tmp_path / "test.csv" - csv_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(csv_file)) - mock_get_loader.assert_called_once_with(str(csv_file)) - assert result is mock_loader - - def test_get_file_loader_xlsx(self, tmp_path): - """Get file loader for Excel files.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - xlsx_file = tmp_path / "test.xlsx" - xlsx_file.touch() - - mock_loader = Mock() - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=mock_loader, - ) as mock_get_loader: - result = _get_file_loader(str(xlsx_file)) - mock_get_loader.assert_called_once_with(str(xlsx_file)) - assert result is mock_loader - - def test_get_file_loader_unknown_extension(self, tmp_path): - """Get file loader for unknown extension falls back to TextLoader.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - unknown_file = tmp_path / "test.xyz" - unknown_file.touch() - - # When extension is not supported, get_loader_for_path returns None - # and _get_file_loader falls back to TextLoader - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.is_extension_supported", - return_value=False, - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.TextLoader" - ) as mock_text_loader: - _get_file_loader(str(unknown_file)) - mock_text_loader.assert_called_once_with( - str(unknown_file), - encoding="utf-8", - autodetect_encoding=True, - ) - - def test_get_file_loader_exception(self, tmp_path): - """Get file loader handles exceptions.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _get_file_loader, - ) - - pdf_file = tmp_path / "test.pdf" - pdf_file.touch() - - # When get_loader_for_path raises an exception, it should return None - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.is_extension_supported", - return_value=True, - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_loader_for_path", - return_value=None, - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.TextLoader", - side_effect=Exception("Loader error"), - ): - loader = _get_file_loader(str(pdf_file)) - assert loader is None - - -class TestLoadDocument: - """Tests for _load_document helper function.""" - - def test_load_document_success(self, tmp_path): - """Load document successfully.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _load_document, - ) - from langchain_core.documents import Document - - txt_file = tmp_path / "test.txt" - txt_file.write_text("Test content") - - mock_doc = Document(page_content="Test content", metadata={}) - mock_loader = Mock() - mock_loader.load.return_value = [mock_doc] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader", - return_value=mock_loader, - ): - docs = _load_document(txt_file) - - assert len(docs) == 1 - assert docs[0].page_content == "Test content" - assert docs[0].metadata["source"] == str(txt_file) - assert docs[0].metadata["filename"] == "test.txt" - - def test_load_document_no_loader(self, tmp_path): - """Load document with no available loader.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _load_document, - ) - - file_path = tmp_path / "test.xyz" - file_path.touch() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader", - return_value=None, - ): - docs = _load_document(file_path) - assert docs == [] - - def test_load_document_exception(self, tmp_path): - """Load document handles exceptions.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - _load_document, - ) - - txt_file = tmp_path / "test.txt" - txt_file.touch() - - mock_loader = Mock() - mock_loader.load.side_effect = Exception("Load error") - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local._get_file_loader", - return_value=mock_loader, - ): - docs = _load_document(txt_file) - assert docs == [] - - -class TestLocalEmbeddingManagerInit: - """Tests for LocalEmbeddingManager initialization.""" - - def test_init_with_defaults(self, tmp_path): - """Initialize with default values.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory", - return_value=tmp_path, - ): - manager = LocalEmbeddingManager() - - assert manager.embedding_model == "all-MiniLM-L6-v2" - assert manager.embedding_device == "cpu" - assert manager.embedding_model_type == "sentence_transformers" - assert manager.chunk_size == 1000 - assert manager.chunk_overlap == 200 - assert manager._embeddings is None # Lazy initialization - - def test_init_with_custom_cache_dir(self, tmp_path): - """Initialize with custom cache directory.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - custom_cache = tmp_path / "custom_cache" - manager = LocalEmbeddingManager(cache_dir=str(custom_cache)) - - assert manager.cache_dir == custom_cache - assert custom_cache.exists() - - def test_init_with_ollama(self, tmp_path): - """Initialize with Ollama embeddings.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager( - embedding_model_type="ollama", - embedding_model="llama2", - ollama_base_url="http://localhost:11434", - cache_dir=str(tmp_path), - ) - - assert manager.embedding_model_type == "ollama" - assert manager.embedding_model == "llama2" - assert manager.ollama_base_url == "http://localhost:11434" - - def test_init_with_settings_snapshot(self, tmp_path): - """Initialize with settings snapshot.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - settings = {"_username": "testuser"} - manager = LocalEmbeddingManager( - settings_snapshot=settings, cache_dir=str(tmp_path) - ) - - assert manager.username == "testuser" - assert manager.settings_snapshot == settings - - -class TestLocalEmbeddingManagerEmbeddings: - """Tests for LocalEmbeddingManager embeddings property.""" - - def test_embeddings_lazy_initialization(self, tmp_path): - """Embeddings are lazily initialized.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - assert manager._embeddings is None - - # Mock the embeddings initialization - mock_embeddings = Mock() - with patch.object( - manager, "_initialize_embeddings", return_value=mock_embeddings - ): - embeddings = manager.embeddings - - assert embeddings is mock_embeddings - assert manager._embeddings is mock_embeddings - - def test_embeddings_reuse(self, tmp_path): - """Embeddings are reused after initialization.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - mock_embeddings = Mock() - manager._embeddings = mock_embeddings - - # Should return existing embeddings without reinitializing - with patch.object(manager, "_initialize_embeddings") as mock_init: - embeddings = manager.embeddings - - assert embeddings is mock_embeddings - mock_init.assert_not_called() - - -class TestLocalEmbeddingManagerIndexedFolders: - """Tests for LocalEmbeddingManager indexed folders management.""" - - def test_load_indexed_folders_empty(self, tmp_path): - """Load indexed folders when no metadata exists.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - assert manager.indexed_folders == {} - - def test_load_indexed_folders_from_disk(self, tmp_path): - """Load indexed folders from disk.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - # Create metadata file - metadata = { - "abc123": { - "path": "/test/folder", - "last_indexed": 1234567890, - "file_count": 10, - } - } - metadata_file = tmp_path / "index_metadata.json" - metadata_file.write_text(json.dumps(metadata)) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - assert "abc123" in manager.indexed_folders - assert manager.indexed_folders["abc123"]["path"] == "/test/folder" - - def test_save_indexed_folders(self, tmp_path): - """Save indexed folders to disk.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - manager.indexed_folders = { - "xyz789": { - "path": "/another/folder", - "last_indexed": 9876543210, - } - } - - manager._save_indexed_folders() - - metadata_file = tmp_path / "index_metadata.json" - assert metadata_file.exists() - - saved_data = json.loads(metadata_file.read_text()) - assert "xyz789" in saved_data - - -class TestLocalEmbeddingManagerFolderHash: - """Tests for LocalEmbeddingManager folder hash methods.""" - - def test_get_folder_hash(self, tmp_path): - """Get folder hash is deterministic.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - folder_path = tmp_path / "test_folder" - folder_path.mkdir() - - hash1 = LocalEmbeddingManager.get_folder_hash(folder_path) - hash2 = LocalEmbeddingManager.get_folder_hash(folder_path) - - assert hash1 == hash2 - assert len(hash1) == 32 # MD5 hash length - - def test_get_folder_hash_different_folders(self, tmp_path): - """Different folders have different hashes.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - folder1 = tmp_path / "folder1" - folder2 = tmp_path / "folder2" - folder1.mkdir() - folder2.mkdir() - - hash1 = LocalEmbeddingManager.get_folder_hash(folder1) - hash2 = LocalEmbeddingManager.get_folder_hash(folder2) - - assert hash1 != hash2 - - def test_get_index_path(self, tmp_path): - """Get index path for a folder.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - folder_path = Path("/test/folder") - index_path = manager._get_index_path(folder_path) - - assert "index_" in str(index_path) - assert index_path.parent == tmp_path - - -class TestLocalEmbeddingManagerGetAllFiles: - """Tests for LocalEmbeddingManager _get_all_files method.""" - - def test_get_all_files(self, tmp_path): - """Get all files in a folder recursively.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - # Create test folder structure - (tmp_path / "file1.txt").touch() - (tmp_path / "file2.txt").touch() - subdir = tmp_path / "subdir" - subdir.mkdir() - (subdir / "file3.txt").touch() - - files = list(LocalEmbeddingManager._get_all_files(tmp_path)) - - assert len(files) == 3 - filenames = [f.name for f in files] - assert "file1.txt" in filenames - assert "file2.txt" in filenames - assert "file3.txt" in filenames - - -class TestLocalEmbeddingManagerCheckConfigChanged: - """Tests for LocalEmbeddingManager config change detection.""" - - def test_check_config_changed_new_folder(self, tmp_path): - """Config changed is True for new folder.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - folder = tmp_path / "new_folder" - folder.mkdir() - - assert manager._check_config_changed(folder) is True - - def test_check_config_changed_same_config(self, tmp_path): - """Config changed is False when config is the same.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - folder = tmp_path / "test_folder" - folder.mkdir() - - folder_hash = manager.get_folder_hash(folder) - manager.indexed_folders[folder_hash] = { - "chunk_size": 1000, - "chunk_overlap": 200, - "embedding_model": "all-MiniLM-L6-v2", - } - - assert manager._check_config_changed(folder) is False - - def test_check_config_changed_different_chunk_size(self, tmp_path): - """Config changed is True when chunk size differs.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - folder = tmp_path / "test_folder" - folder.mkdir() - - folder_hash = manager.get_folder_hash(folder) - manager.indexed_folders[folder_hash] = { - "chunk_size": 500, # Different from default 1000 - "chunk_overlap": 200, - "embedding_model": "all-MiniLM-L6-v2", - } - - assert manager._check_config_changed(folder) is True - - -class TestLocalEmbeddingManagerClearCache: - """Tests for LocalEmbeddingManager clear_cache method.""" - - def test_clear_cache(self, tmp_path): - """Clear cache removes vector stores from memory.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - manager.vector_stores = {"hash1": Mock(), "hash2": Mock()} - - manager.clear_cache() - - assert manager.vector_stores == {} - - -class TestLocalEmbeddingManagerGetIndexedFoldersInfo: - """Tests for LocalEmbeddingManager get_indexed_folders_info method.""" - - def test_get_indexed_folders_info_empty(self, tmp_path): - """Get indexed folders info when no folders indexed.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - info = manager.get_indexed_folders_info() - - assert info == [] - - def test_get_indexed_folders_info_with_folders(self, tmp_path): - """Get indexed folders info with indexed folders.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager(cache_dir=str(tmp_path)) - - test_folder = tmp_path / "test_folder" - test_folder.mkdir() - - folder_hash = manager.get_folder_hash(test_folder) - manager.indexed_folders[folder_hash] = { - "path": str(test_folder), - "last_indexed": 1234567890, - "file_count": 5, - "chunk_count": 20, - } - - info = manager.get_indexed_folders_info() - - assert len(info) == 1 - assert info[0]["path"] == str(test_folder) - assert info[0]["file_count"] == 5 - assert "last_indexed_formatted" in info[0] - - -class TestLocalSearchEngineInit: - """Tests for LocalSearchEngine initialization.""" - - def test_init_with_valid_paths(self, tmp_path): - """Initialize with valid folder paths.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - # Mock embedding manager to avoid actual initialization - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine( - paths=[str(folder)], - name="Test Collection", - description="Test description", - ) - - assert str(folder) in engine.valid_folder_paths - assert engine.name == "Test Collection" - assert engine.description == "Test description" - - def test_init_with_invalid_paths(self, tmp_path): - """Initialize with invalid folder paths.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - invalid_path = str(tmp_path / "nonexistent") - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ): - engine = LocalSearchEngine(paths=[invalid_path]) - - assert invalid_path not in engine.valid_folder_paths - assert engine.valid_folder_paths == [] - - def test_init_with_custom_max_results(self, tmp_path): - """Initialize with custom max_results.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine(paths=[str(folder)], max_results=50) - - assert engine.max_results == 50 - - def test_init_with_collections(self, tmp_path): - """Initialize with named collections.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder1 = tmp_path / "research" - folder2 = tmp_path / "notes" - folder1.mkdir() - folder2.mkdir() - - collections = { - "research": { - "paths": [str(folder1)], - "description": "Research papers", - }, - "notes": {"paths": [str(folder2)], "description": "Personal notes"}, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine( - paths=[str(folder1), str(folder2)], collections=collections - ) - - assert "research" in engine.collections - assert "notes" in engine.collections - - -class TestLocalSearchEngineGetPreviews: - """Tests for LocalSearchEngine _get_previews method.""" - - def test_get_previews_returns_results(self, tmp_path): - """Get previews returns formatted results.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - mock_results = [ - { - "content": "Test content for document one", - "metadata": { - "source": str(folder / "doc1.txt"), - "filename": "doc1.txt", - }, - "similarity": 0.95, - "folder": folder, - } - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.search.return_value = mock_results - - engine = LocalSearchEngine(paths=[str(folder)]) - previews = engine._get_previews("test query") - - assert len(previews) == 1 - assert previews[0]["title"] == "doc1.txt" - assert previews[0]["similarity"] == 0.95 - - def test_get_previews_empty_results(self, tmp_path): - """Get previews handles empty results.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.search.return_value = [] - - engine = LocalSearchEngine(paths=[str(folder)]) - previews = engine._get_previews("test query") - - assert previews == [] - - def test_get_previews_no_valid_folders(self, tmp_path): - """Get previews returns empty for no valid folders.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ): - engine = LocalSearchEngine(paths=["/nonexistent/path"]) - previews = engine._get_previews("test query") - - assert previews == [] - - -class TestLocalSearchEngineGetFullContent: - """Tests for LocalSearchEngine _get_full_content method.""" - - def test_get_full_content_preserves_content(self, tmp_path): - """Get full content preserves full content from items.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - items = [ - { - "id": "local-1", - "title": "Doc 1", - "_full_content": "This is the full content of document 1", - "_metadata": {"source": "/path/to/doc1.txt"}, - } - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.search_config" - ) as mock_config: - mock_config.SEARCH_SNIPPETS_ONLY = False - - engine = LocalSearchEngine(paths=[str(folder)]) - results = engine._get_full_content(items) - - assert len(results) == 1 - assert ( - results[0]["full_content"] - == "This is the full content of document 1" - ) - assert "_full_content" not in results[0] - - def test_get_full_content_snippets_only(self, tmp_path): - """Get full content respects snippets-only mode.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - items = [ - { - "id": "local-1", - "title": "Doc 1", - "_full_content": "Full content", - } - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.search_config" - ) as mock_config: - mock_config.SEARCH_SNIPPETS_ONLY = True - - engine = LocalSearchEngine(paths=[str(folder)]) - results = engine._get_full_content(items) - - # In snippets-only mode, items are returned as-is - assert results == items - - -class TestLocalSearchEngineRun: - """Tests for LocalSearchEngine run method.""" - - def test_run_returns_results(self, tmp_path): - """Run returns search results.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - mock_results = [ - { - "content": "Test content", - "metadata": { - "source": str(folder / "doc.txt"), - "filename": "doc.txt", - }, - "similarity": 0.9, - "folder": folder, - } - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.search.return_value = mock_results - mock_manager.return_value.clear_cache = Mock() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.search_config" - ) as mock_config: - mock_config.SEARCH_SNIPPETS_ONLY = False - - engine = LocalSearchEngine(paths=[str(folder)]) - - # Mock _filter_for_relevance to return all items - with patch.object( - engine, "_filter_for_relevance", side_effect=lambda x, q: x - ): - results = engine.run("test query") - - assert len(results) >= 1 - - def test_run_with_collection_filter(self, tmp_path): - """Run with collection filter in query.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - collections = { - "research": { - "paths": [str(folder)], - "description": "Research papers", - }, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.search.return_value = [] - mock_manager.return_value.clear_cache = Mock() - - engine = LocalSearchEngine( - paths=[str(folder)], collections=collections - ) - results = engine.run("collection:research test query") - - # Should parse collection from query - assert results == [] - - def test_run_empty_previews(self, tmp_path): - """Run returns empty when no previews found.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.search.return_value = [] - mock_manager.return_value.clear_cache = Mock() - - engine = LocalSearchEngine(paths=[str(folder)]) - results = engine.run("test query") - - assert results == [] - - -class TestLocalSearchEngineGetCollectionsInfo: - """Tests for LocalSearchEngine get_collections_info method.""" - - def test_get_collections_info(self, tmp_path): - """Get collections info returns collection details.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - collections = { - "docs": {"paths": [str(folder)], "description": "Documents"}, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - mock_manager.return_value.indexed_folders = {} - mock_manager.return_value.get_folder_hash.return_value = "abc123" - - engine = LocalSearchEngine( - paths=[str(folder)], collections=collections - ) - info = engine.get_collections_info() - - assert len(info) == 1 - assert info[0]["name"] == "docs" - assert info[0]["description"] == "Documents" - - -class TestLocalSearchEngineReindexCollection: - """Tests for LocalSearchEngine reindex_collection method.""" - - def test_reindex_collection_success(self, tmp_path): - """Reindex collection successfully.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - collections = { - "docs": {"paths": [str(folder)], "description": "Documents"}, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine( - paths=[str(folder)], collections=collections - ) - result = engine.reindex_collection("docs") - - assert result is True - - def test_reindex_collection_not_found(self, tmp_path): - """Reindex collection that doesn't exist.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine(paths=[str(folder)]) - result = engine.reindex_collection("nonexistent") - - assert result is False - - -class TestLocalSearchEngineFromConfig: - """Tests for LocalSearchEngine from_config class method.""" - - def test_from_config_with_collections(self, tmp_path): - """Create from config with collections.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - config = { - "collections": { - "docs": {"paths": [str(folder)], "description": "Documents"}, - }, - "max_results": 20, - "embedding_model": "custom-model", - "chunk_size": 500, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine.from_config(config) - - assert engine.max_results == 20 - assert "docs" in engine.collections - - def test_from_config_with_folder_paths(self, tmp_path): - """Create from config with folder_paths fallback.""" - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - folder = tmp_path / "documents" - folder.mkdir() - - config = { - "folder_paths": [str(folder)], - "max_results": 15, - } - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager" - ) as mock_manager: - mock_manager.return_value.index_folder.return_value = True - - engine = LocalSearchEngine.from_config(config) - - assert engine.max_results == 15 - assert "default" in engine.collections diff --git a/tests/web_search_engines/engines/test_search_engine_local_all.py b/tests/web_search_engines/engines/test_search_engine_local_all.py deleted file mode 100644 index 27ce69258..000000000 --- a/tests/web_search_engines/engines/test_search_engine_local_all.py +++ /dev/null @@ -1,463 +0,0 @@ -""" -Tests for the LocalAllSearchEngine class. - -Tests cover: -- Initialization and configuration -- Local engine discovery -- Search across all collections -- Preview aggregation -- Full content retrieval -""" - -from unittest.mock import Mock, patch - - -class TestLocalAllSearchEngineInit: - """Tests for LocalAllSearchEngine initialization.""" - - def test_init_with_defaults(self): - """Initialize with default values.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine() - - assert engine.max_results == 10 - assert engine.local_engines == {} - - def test_init_with_custom_max_results(self): - """Initialize with custom max_results.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine(max_results=25) - - assert engine.max_results == 25 - - def test_init_discovers_local_engines(self): - """Initialize and discover local collection engines.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "Test Collection" - mock_engine.description = "Test description" - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["collection1", "collection2"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine() - - assert "collection1" in engine.local_engines - assert "collection2" in engine.local_engines - - def test_init_handles_engine_creation_failure(self): - """Initialize handles engine creation failure gracefully.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["collection1"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - side_effect=Exception("Engine creation failed"), - ): - engine = LocalAllSearchEngine() - - assert engine.local_engines == {} - - def test_init_handles_import_error(self): - """Initialize handles ImportError for local_search_engines.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - side_effect=ImportError("No config found"), - ): - engine = LocalAllSearchEngine() - - assert engine.local_engines == {} - - def test_init_with_llm(self): - """Initialize with LLM.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_llm = Mock() - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine(llm=mock_llm) - - assert engine.llm is mock_llm - - def test_init_with_settings_snapshot(self): - """Initialize with settings snapshot.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - settings = {"_username": "testuser"} - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine(settings_snapshot=settings) - - assert engine.settings_snapshot == settings - - def test_init_with_programmatic_mode(self): - """Initialize with programmatic mode.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine(programmatic_mode=True) - - assert engine.programmatic_mode is True - - -class TestGetPreviews: - """Tests for _get_previews method.""" - - def test_get_previews_returns_aggregated_results(self): - """Get previews returns results from all local engines.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine1 = Mock() - mock_engine1.name = "Collection 1" - mock_engine1.description = "Description 1" - mock_engine1._get_previews.return_value = [ - {"id": "1", "snippet": "Result 1", "similarity": 0.9} - ] - - mock_engine2 = Mock() - mock_engine2.name = "Collection 2" - mock_engine2.description = "Description 2" - mock_engine2._get_previews.return_value = [ - {"id": "2", "snippet": "Result 2", "similarity": 0.8} - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1", "col2"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine" - ) as mock_create: - mock_create.side_effect = [mock_engine1, mock_engine2] - - engine = LocalAllSearchEngine() - previews = engine._get_previews("test query") - - assert len(previews) == 2 - assert previews[0]["collection_id"] == "col1" - assert previews[1]["collection_id"] == "col2" - - def test_get_previews_sorts_by_similarity(self): - """Get previews sorts results by similarity.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine1 = Mock() - mock_engine1.name = "Collection 1" - mock_engine1.description = "Description 1" - mock_engine1._get_previews.return_value = [ - {"id": "1", "snippet": "Result 1", "similarity": 0.5} - ] - - mock_engine2 = Mock() - mock_engine2.name = "Collection 2" - mock_engine2.description = "Description 2" - mock_engine2._get_previews.return_value = [ - {"id": "2", "snippet": "Result 2", "similarity": 0.9} - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1", "col2"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine" - ) as mock_create: - mock_create.side_effect = [mock_engine1, mock_engine2] - - engine = LocalAllSearchEngine() - previews = engine._get_previews("test query") - - # Higher similarity should come first - assert previews[0]["similarity"] == 0.9 - assert previews[1]["similarity"] == 0.5 - - def test_get_previews_limits_results(self): - """Get previews limits results to max_results.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "Collection" - mock_engine.description = "Description" - mock_engine._get_previews.return_value = [ - { - "id": str(i), - "snippet": f"Result {i}", - "similarity": 0.9 - i * 0.1, - } - for i in range(10) - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine(max_results=5) - previews = engine._get_previews("test query") - - assert len(previews) == 5 - - def test_get_previews_empty_results(self): - """Get previews handles no local engines.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine() - previews = engine._get_previews("test query") - - assert previews == [] - - def test_get_previews_handles_engine_error(self): - """Get previews handles engine search error gracefully.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "Collection" - mock_engine.description = "Description" - mock_engine._get_previews.side_effect = Exception("Search error") - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine() - previews = engine._get_previews("test query") - - assert previews == [] - - def test_get_previews_adds_collection_info(self): - """Get previews adds collection info to each preview.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "My Collection" - mock_engine.description = "Collection description" - mock_engine._get_previews.return_value = [ - {"id": "1", "snippet": "Result"} - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["my_collection"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine() - previews = engine._get_previews("test query") - - assert previews[0]["collection_id"] == "my_collection" - assert previews[0]["collection_name"] == "My Collection" - assert ( - previews[0]["collection_description"] - == "Collection description" - ) - - -class TestGetFullContent: - """Tests for _get_full_content method.""" - - def test_get_full_content_delegates_to_engines(self): - """Get full content delegates to appropriate collection engines.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "Collection" - mock_engine.description = "Description" - mock_engine._get_full_content.return_value = [ - {"id": "1", "full_content": "Full content 1"} - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine() - - items = [{"id": "1", "collection_id": "col1"}] - results = engine._get_full_content(items) - - assert len(results) == 1 - assert results[0]["full_content"] == "Full content 1" - - def test_get_full_content_groups_by_collection(self): - """Get full content groups items by collection.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine1 = Mock() - mock_engine1.name = "Collection 1" - mock_engine1.description = "Description 1" - mock_engine1._get_full_content.return_value = [ - {"id": "1", "full_content": "Content 1"} - ] - - mock_engine2 = Mock() - mock_engine2.name = "Collection 2" - mock_engine2.description = "Description 2" - mock_engine2._get_full_content.return_value = [ - {"id": "2", "full_content": "Content 2"} - ] - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1", "col2"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine" - ) as mock_create: - mock_create.side_effect = [mock_engine1, mock_engine2] - - engine = LocalAllSearchEngine() - - items = [ - {"id": "1", "collection_id": "col1"}, - {"id": "2", "collection_id": "col2"}, - ] - results = engine._get_full_content(items) - - assert len(results) == 2 - - def test_get_full_content_handles_engine_error(self): - """Get full content handles engine error gracefully.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - mock_engine = Mock() - mock_engine.name = "Collection" - mock_engine.description = "Description" - mock_engine._get_full_content.side_effect = Exception("Content error") - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=["col1"], - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.create_search_engine", - return_value=mock_engine, - ): - engine = LocalAllSearchEngine() - - items = [ - {"id": "1", "collection_id": "col1", "snippet": "Preview"} - ] - results = engine._get_full_content(items) - - # Should return original items on error - assert len(results) == 1 - assert results[0]["id"] == "1" - - def test_get_full_content_handles_unknown_collection(self): - """Get full content handles items with unknown collection.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine() - - items = [{"id": "1", "collection_id": "unknown_collection"}] - results = engine._get_full_content(items) - - # Should return the unprocessed item - assert len(results) == 1 - assert results[0]["id"] == "1" - - def test_get_full_content_handles_missing_collection_id(self): - """Get full content handles items without collection_id.""" - from local_deep_research.web_search_engines.engines.search_engine_local_all import ( - LocalAllSearchEngine, - ) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local_all.local_search_engines", - return_value=[], - ): - engine = LocalAllSearchEngine() - - items = [{"id": "1", "snippet": "No collection ID"}] - results = engine._get_full_content(items) - - # Should return unprocessed item - assert len(results) == 1 - assert results[0]["id"] == "1" diff --git a/tests/web_search_engines/rate_limiting/test_tracker.py b/tests/web_search_engines/rate_limiting/test_tracker.py index 1bd442dcd..9b238837b 100644 --- a/tests/web_search_engines/rate_limiting/test_tracker.py +++ b/tests/web_search_engines/rate_limiting/test_tracker.py @@ -236,37 +236,6 @@ class TestGetWaitTime: assert wait_time == 0.1 # Optimistic default - def test_local_engine_returns_zero(self): - """Test that LocalSearchEngine gets zero wait time.""" - from local_deep_research.config.thread_settings import ( - NoSettingsContextError, - ) - - with patch( - "local_deep_research.web_search_engines.rate_limiting.tracker.get_setting_from_snapshot" - ) as mock_get_setting: - mock_get_setting.side_effect = NoSettingsContextError("No settings") - - with patch( - "local_deep_research.web_search_engines.rate_limiting.tracker.logger" - ): - with patch( - "local_deep_research.web_search_engines.rate_limiting.tracker.get_search_context" - ) as mock_context: - mock_context.return_value = {"username": "test"} - - from local_deep_research.web_search_engines.rate_limiting.tracker import ( - AdaptiveRateLimitTracker, - ) - - tracker = AdaptiveRateLimitTracker(programmatic_mode=True) - tracker.enabled = True - tracker._estimates_loaded = True - - wait_time = tracker.get_wait_time("LocalSearchEngine") - - assert wait_time == 0.0 - def test_known_engine_uses_estimate(self): """Test that known engine uses learned estimate.""" from local_deep_research.config.thread_settings import ( diff --git a/tests/web_search_engines/test_local_embedding_manager.py b/tests/web_search_engines/test_local_embedding_manager.py index e5ff7efee..a76fd0f68 100644 --- a/tests/web_search_engines/test_local_embedding_manager.py +++ b/tests/web_search_engines/test_local_embedding_manager.py @@ -1,204 +1,69 @@ """ -Tests for LocalEmbeddingManager cache directory handling and thread safety. +Tests for LocalEmbeddingManager thread safety and cache directory utilities. -These tests verify that the cache directory is properly resolved -to an absolute path using the application's configured cache directory, -and that embedding initialization is thread-safe. +These tests verify that embedding initialization is thread-safe +and that the cache directory utility returns correct paths. """ import os import tempfile import threading -from pathlib import Path from unittest.mock import MagicMock, patch -class TestLocalEmbeddingManagerCacheDir: - """Tests for LocalEmbeddingManager cache directory configuration.""" - - def test_cache_dir_uses_absolute_path_when_none(self): - """When cache_dir is None, should use get_cache_directory().""" - # Create a temporary directory to use as the cache directory - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - # Mock dependencies to avoid loading actual models - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory", - return_value=temp_path, - ): - # Also mock the embeddings to avoid loading real models - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders", - return_value={}, - ): - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager( - embedding_model="test-model", - cache_dir=None, # Should use get_cache_directory() - ) - - # Should resolve to temp_path / "local_search" - expected_path = temp_path / "local_search" - assert manager.cache_dir == expected_path - assert manager.cache_dir.is_absolute() - - def test_cache_dir_uses_explicit_path_when_provided(self): - """When cache_dir is provided, should use that path.""" - with tempfile.TemporaryDirectory() as temp_dir: - explicit_path = str(Path(temp_dir) / "my_custom_cache") - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders", - return_value={}, - ): - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - manager = LocalEmbeddingManager( - embedding_model="test-model", - cache_dir=explicit_path, - ) - - assert manager.cache_dir == Path(explicit_path) - - def test_cache_dir_not_relative(self): - """Cache dir should never be a relative path like '.cache'.""" - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory", - return_value=temp_path, - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders", - return_value={}, - ): - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) - - # Default behavior (cache_dir=None) should NOT result in .cache - manager = LocalEmbeddingManager( - embedding_model="test-model", - ) - - # The path should not start with ".cache" - assert not str(manager.cache_dir).startswith(".cache") - # The path should be absolute - assert manager.cache_dir.is_absolute() - - -class TestLocalSearchEngineCacheDir: - """Tests for LocalSearchEngine cache directory configuration.""" - - def test_from_config_uses_none_for_missing_cache_dir(self): - """from_config should pass None when cache_dir not in config.""" - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) - - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory", - return_value=temp_path, - ): - with patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders", - return_value={}, - ): - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalSearchEngine, - ) - - # Create engine from config without cache_dir specified - config = { - "folder_paths": [ - temp_dir - ], # Use temp_dir as a valid folder - } - - engine = LocalSearchEngine.from_config(config) - - # The embedding manager's cache_dir should be absolute - assert engine.embedding_manager.cache_dir.is_absolute() - # Should not be the old relative path - assert not str( - engine.embedding_manager.cache_dir - ).startswith(".cache") - - class TestEmbeddingThreadSafety: """Tests that embedding initialization is thread-safe.""" def test_concurrent_embedding_access_initializes_once(self): """Multiple threads accessing .embeddings should only init once.""" - with tempfile.TemporaryDirectory() as temp_dir: - temp_path = Path(temp_dir) + from local_deep_research.web_search_engines.engines.local_embedding_manager import ( + LocalEmbeddingManager, + ) - with ( - patch( - "local_deep_research.web_search_engines.engines.search_engine_local.get_cache_directory", - return_value=temp_path, - ), - patch( - "local_deep_research.web_search_engines.engines.search_engine_local.LocalEmbeddingManager._load_indexed_folders", - return_value={}, - ), - ): - from local_deep_research.web_search_engines.engines.search_engine_local import ( - LocalEmbeddingManager, - ) + manager = LocalEmbeddingManager( + embedding_model="test-model", + ) - manager = LocalEmbeddingManager( - embedding_model="test-model", - ) + # Track how many times _initialize_embeddings is called + init_count = 0 + init_lock = threading.Lock() + mock_embeddings = MagicMock() - # Track how many times _initialize_embeddings is called - init_count = 0 - init_lock = threading.Lock() - mock_embeddings = MagicMock() + def counting_init(): + nonlocal init_count + with init_lock: + init_count += 1 + # Simulate slow initialization to widen race window + import time - def counting_init(): - nonlocal init_count - with init_lock: - init_count += 1 - # Simulate slow initialization to widen race window - import time + time.sleep(0.1) + return mock_embeddings - time.sleep(0.1) - return mock_embeddings + manager._initialize_embeddings = counting_init - manager._initialize_embeddings = counting_init + # Access embeddings from multiple threads concurrently + results = [] + errors = [] - # Access embeddings from multiple threads concurrently - results = [] - errors = [] + def access_embeddings(): + try: + emb = manager.embeddings + results.append(emb) + except Exception as e: + errors.append(e) - def access_embeddings(): - try: - emb = manager.embeddings - results.append(emb) - except Exception as e: - errors.append(e) + threads = [threading.Thread(target=access_embeddings) for _ in range(4)] + for t in threads: + t.start() + for t in threads: + t.join() - threads = [ - threading.Thread(target=access_embeddings) for _ in range(4) - ] - for t in threads: - t.start() - for t in threads: - t.join() - - assert not errors, f"Threads raised errors: {errors}" - assert init_count == 1, ( - f"_initialize_embeddings called {init_count} times, expected 1" - ) - # All threads should get the same instance - assert all(r is mock_embeddings for r in results) + assert not errors, f"Threads raised errors: {errors}" + assert init_count == 1, ( + f"_initialize_embeddings called {init_count} times, expected 1" + ) + # All threads should get the same instance + assert all(r is mock_embeddings for r in results) class TestGetCacheDirectory: diff --git a/tests/web_search_engines/test_search_engines_config.py b/tests/web_search_engines/test_search_engines_config.py index a6bdb640d..9f03bb3a5 100644 --- a/tests/web_search_engines/test_search_engines_config.py +++ b/tests/web_search_engines/test_search_engines_config.py @@ -326,123 +326,6 @@ class TestSearchConfig: result["custom_retriever"]["class_name"] == "RetrieverSearchEngine" ) - @patch( - "local_deep_research.web_search_engines.retriever_registry.retriever_registry" - ) - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_processes_local_collections(self, mock_get_setting, mock_registry): - """Should process local collection configurations.""" - mock_registry.list_registered.return_value = [] - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "my_docs.enabled": True, - "my_docs.paths": '["./docs"]', - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - search_config, - ) - - result = search_config() - assert "my_docs" in result - assert result["my_docs"]["requires_llm"] is True - - @patch( - "local_deep_research.web_search_engines.retriever_registry.retriever_registry" - ) - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_skips_disabled_local_collections( - self, mock_get_setting, mock_registry - ): - """Should skip disabled local collections.""" - mock_registry.list_registered.return_value = [] - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "disabled_docs.enabled": False, - "disabled_docs.paths": '["./docs"]', - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - search_config, - ) - - result = search_config() - assert "disabled_docs" not in result - - @patch( - "local_deep_research.web_search_engines.retriever_registry.retriever_registry" - ) - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_parses_json_paths_for_local_collection( - self, mock_get_setting, mock_registry - ): - """Should parse JSON array for local collection paths.""" - mock_registry.list_registered.return_value = [] - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "my_docs.enabled": True, - "my_docs.paths": '["./path1", "./path2"]', - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - search_config, - ) - - result = search_config() - assert result["my_docs"]["default_params"]["paths"] == [ - "./path1", - "./path2", - ] - - @patch( - "local_deep_research.web_search_engines.retriever_registry.retriever_registry" - ) - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_handles_invalid_json_paths(self, mock_get_setting, mock_registry): - """Should handle invalid JSON in paths gracefully.""" - mock_registry.list_registered.return_value = [] - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "my_docs.enabled": True, - "my_docs.paths": "invalid json", - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - search_config, - ) - - result = search_config() - # Should set to empty list on JSON error - assert result["my_docs"]["default_params"]["paths"] == [] - @patch( "local_deep_research.web_search_engines.retriever_registry.retriever_registry" ) @@ -717,140 +600,3 @@ class TestDefaultSearchEngine: default_search_engine(settings_snapshot=snapshot) call_kwargs = mock_get_setting.call_args[1] assert call_kwargs["settings_snapshot"] is snapshot - - -class TestLocalSearchEngines: - """Tests for local_search_engines function.""" - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_returns_list_of_enabled_collections(self, mock_get_setting): - """Should return list of enabled local collection names.""" - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "docs1.enabled": True, - "docs2.enabled": True, - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - result = local_search_engines() - assert isinstance(result, list) - assert "docs1" in result - assert "docs2" in result - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_excludes_disabled_collections(self, mock_get_setting): - """Should exclude disabled collections.""" - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "enabled_docs.enabled": True, - "disabled_docs.enabled": False, - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - result = local_search_engines() - assert "enabled_docs" in result - assert "disabled_docs" not in result - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_excludes_local_all_collection(self, mock_get_setting): - """Should exclude the 'local_all' collection.""" - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "local_all.enabled": True, - "my_docs.enabled": True, - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - result = local_search_engines() - assert "local_all" not in result - assert "my_docs" in result - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_returns_empty_list_when_no_local_engines(self, mock_get_setting): - """Should return empty list when no local engines configured.""" - mock_get_setting.return_value = {} - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - result = local_search_engines() - assert result == [] - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_treats_missing_enabled_as_true(self, mock_get_setting): - """Should treat missing 'enabled' field as True (default enabled).""" - - def get_setting_side_effect(key, default, **kwargs): - if key == "search.engine.local": - return { - "implicit_enabled.paths": '["./docs"]', - } - return default - - mock_get_setting.side_effect = get_setting_side_effect - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - result = local_search_engines() - assert "implicit_enabled" in result - - @patch( - "local_deep_research.web_search_engines.search_engines_config._get_setting" - ) - def test_passes_all_parameters_to_get_setting(self, mock_get_setting): - """Should pass username, db_session, and settings_snapshot.""" - mock_get_setting.return_value = {} - mock_session = MagicMock() - snapshot = {"test": "value"} - - from local_deep_research.web_search_engines.search_engines_config import ( - local_search_engines, - ) - - local_search_engines( - username="testuser", - db_session=mock_session, - settings_snapshot=snapshot, - ) - - call_kwargs = mock_get_setting.call_args[1] - assert call_kwargs["username"] == "testuser" - assert call_kwargs["db_session"] is mock_session - assert call_kwargs["settings_snapshot"] is snapshot diff --git a/unraid-templates/local-deep-research.xml b/unraid-templates/local-deep-research.xml index 0b87cb489..3acf267b3 100644 --- a/unraid-templates/local-deep-research.xml +++ b/unraid-templates/local-deep-research.xml @@ -35,9 +35,6 @@ This template includes the main LDR service. For full functionality, you may als 5000 /mnt/user/appdata/local-deep-research/data /mnt/user/appdata/local-deep-research/scripts - - - 0.0.0.0 5000 /data