mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
* feat: Add pre-commit hook to enforce pathlib usage (issue #640) - Created check-pathlib-usage.py pre-commit hook using AST parsing - Detects os.path usage and suggests pathlib alternatives - Fixed os.path.normpath usage in auth/routes.py to use PurePosixPath - Added hook configuration to .pre-commit-config.yaml The hook provides helpful suggestions for replacing os.path calls with their pathlib equivalents for better cross-platform compatibility. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * feat: Add missing pathlib pre-commit hook script Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate core src modules from os.path to pathlib - Fixed web/app_factory.py, config/llm_config.py, metrics/token_counter.py - Fixed utilities/es_utils.py, web/routes/benchmark_routes.py - Fixed web/routes/settings_routes.py, web_search_engines/engines/search_engine_local.py - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.basename() with Path().name - Replaced os.path.dirname() with Path().parent Part of the migration to modern pathlib API for better cross-platform compatibility and cleaner code. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate from os.path to pathlib in src and tests (issue #640) Replaced os.path usage with pathlib.Path throughout: - src/local_deep_research/benchmarks: All os.path.join, exists, dirname, basename, abspath replaced - tests directory: Complete migration of all test files - Improved cross-platform compatibility and code readability - Kept os.path.expandvars in env_settings.py (no pathlib equivalent) Part of pre-commit hook enforcement for pathlib usage. Remaining work: examples/ and scripts/ directories. 
Co-Authored-By: djpetti * fix: Complete migration from os.path to pathlib.Path (issue #640) Completed manual migration of all os.path usage to pathlib.Path across: - scripts/ directory (3 files) - examples/ directory (25 files total) - examples/benchmarks/ (8 files) - examples/optimization/ (16 files) - examples/show_env_vars.py - src/local_deep_research/settings/env_settings.py Changes made: - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.dirname() with Path().parent - Replaced os.path.basename() with Path().name or Path().stem - Replaced os.path.abspath() with Path().resolve() - Replaced os.makedirs() with Path().mkdir(parents=True, exist_ok=True) - Added pathlib import where needed Note: Kept os.path.expandvars in env_settings.py as there is no pathlib equivalent. Added comment explaining this limitation. This completes the pathlib migration for issue #640. Co-Authored-By: djpetti * fix: Allow os.path.expandvars in pathlib pre-commit hook Updated the check-pathlib-usage.py pre-commit hook to skip checking os.path.expandvars since it has no pathlib equivalent. Changes: - Added exception for expandvars in both visit_Attribute and visit_Call methods - Added comment in equivalents dictionary noting expandvars is allowed - This allows env_settings.py to use os.path.expandvars without failing checks This resolves the pre-commit CI failure while maintaining the pathlib enforcement for all other os.path methods. Co-Authored-By: djpetti --------- Co-authored-by: djpetti
320 lines
11 KiB
Python
Executable File
320 lines
11 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
|
|
Multi-benchmark optimization example for Local Deep Research.
|
|
|
|
This script demonstrates how to run optimization with multiple benchmark types
|
|
and custom weights between them.
|
|
|
|
Usage:
|
|
# Run from project root with venv activated
|
|
cd /path/to/local-deep-research
|
|
source .venv/bin/activate
|
|
cd src
|
|
python ../examples/optimization/run_multi_benchmark.py
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from datetime import datetime, UTC
|
|
from pathlib import Path
|
|
from typing import Any, Dict
|
|
|
|
from loguru import logger
|
|
|
|
# Add the repository's src directory to the Python path so that
# local_deep_research can be imported without installing the package.
# This script lives in <repo>/examples/optimization/ (see the usage note in
# the module docstring), so the repository root is three levels above this
# file: optimization/ -> examples/ -> <repo>.
src_dir = str(Path(__file__).resolve().parent.parent.parent / "src")
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Use environment variables for configuration
# The system should be configured with proper environment variables:
# - ANTHROPIC_API_KEY for Anthropic API access
# - OPENROUTER_API_KEY for OpenRouter API access (if used)
# - LDR_DATA_DIR for data directory location (if needed)
# When LDR_DATA_DIR is not set, fall back to <src_dir>/data.
data_dir = os.environ.get("LDR_DATA_DIR", str(Path(src_dir) / "data"))

# Import benchmark optimization functions
try:
    from local_deep_research.benchmarks.optimization.api import (
        optimize_parameters,
    )

    print("Successfully imported optimization API")
except ImportError as e:
    # Nothing below can run without the API, so show the search path to help
    # diagnose the failure and stop with a non-zero exit status.
    print(f"Error importing optimization API: {e}")
    print("Current sys.path:", sys.path)
    sys.exit(1)
|
|
|
|
|
|
def print_optimization_results(params: Dict[str, Any], score: float) -> None:
    """Print optimization results in a nicely formatted way.

    Args:
        params: Mapping of parameter name to the best value found. Each entry
            is printed on its own line in the mapping's iteration order.
        score: Score achieved by ``params``; printed with four decimals.
    """
    separator = "=" * 50

    print("\n" + separator)
    print(" OPTIMIZATION RESULTS ")
    print(separator)
    print(f"SCORE: {score:.4f}")
    print("\nBest Parameters:")
    for param, value in params.items():
        print(f" {param}: {value}")
    print(separator + "\n")
|
|
|
|
|
|
def _make_evaluator(weights: Dict[str, float]):
    """Build a CompositeBenchmarkEvaluator for the given benchmark weights.

    The evaluator class is imported here, on every call, so that each demo
    run is self-contained: an import failure is raised inside the calling
    run's ``try`` block and reported as the real ImportError instead of
    surfacing later as an unbound-name error.
    """
    from local_deep_research.benchmarks.evaluators import (
        CompositeBenchmarkEvaluator,
    )

    return CompositeBenchmarkEvaluator(weights)


def _print_evaluation_summary(results: Dict[str, Any]) -> None:
    """Print the score, weights and benchmark names of an evaluation result.

    Missing keys fall back to a 0.0 score and empty mappings.
    """
    print(f"Quality score: {results.get('quality_score', 0.0):.4f}")
    print("Benchmark weights used:", results.get("benchmark_weights", {}))
    print(
        "Individual benchmark results:",
        list(results.get("benchmark_results", {}).keys()),
    )


def main() -> None:
    """Run multi-benchmark optimization examples.

    Creates a timestamped results directory and then runs five minimal
    demonstrations in sequence: SimpleQA only, BrowseComp only, a weighted
    combination of both, speed optimization and efficiency optimization.
    Each run has its own ``try``/``except`` so that a failure is logged and
    printed without preventing the remaining runs.
    """
    # Create a timestamp-based directory for results
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    run_name = f"multi_benchmark_{timestamp}"

    # Put results in the data directory for easier access; when it does not
    # exist, fall back to a directory relative to the current working dir.
    if Path(data_dir).is_dir():
        output_dir = Path(data_dir) / "optimization_results" / run_name
    else:
        output_dir = Path("optimization_results") / run_name
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n🔬 Multi-Benchmark Optimization Example 🔬")
    print(f"Results will be saved to: {output_dir}")

    # Example query for running optimization
    query = "Recent developments in fusion energy research"

    # Very small parameter space for quick testing
    tiny_param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 2,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 2,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid"],
        },
    }

    # Very minimal system settings shared by the direct evaluations below.
    # Defined outside any try block so every run can rely on it.
    mini_system_config = {
        "iterations": 1,
        "questions_per_iteration": 1,
        "search_strategy": "rapid",
        "max_results": 2,  # Very few results
        "search_tool": "wikipedia",  # Fast search engine
        "timeout": 5,  # Extremely short timeout to speed up demo
    }

    # Run 1: SimpleQA benchmark only with minimal trials
    print("\n🔍 Running SimpleQA-only optimization (minimal test)...")
    try:
        print("Creating benchmark evaluator with SimpleQA only")
        evaluator = _make_evaluator({"simpleqa": 1.0})

        print("Running single benchmark evaluation (no optimization)...")
        quality_results = evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Use just 1 example for speed
            output_dir=str(output_dir / "simpleqa_test"),
        )

        print("Benchmark evaluation complete!")
        _print_evaluation_summary(quality_results)

        # Also run the Optuna optimizer with minimal settings
        print("\nRunning minimal Optuna optimization...")
        params1, score1 = optimize_parameters(
            query=query,
            param_space=tiny_param_space,  # Use tiny param space
            output_dir=str(output_dir / "simpleqa_only"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 1.0},  # SimpleQA only
            timeout=5,  # Limit to 5 seconds
        )
        print_optimization_results(params1, score1)
    except Exception as e:
        logger.exception("Error running SimpleQA optimization")
        print(f"Error: {e}")

    # Run 2: BrowseComp benchmark only (minimal test)
    print("\n🔍 Running BrowseComp-only benchmark (minimal test)...")
    try:
        print("Creating benchmark evaluator with BrowseComp only")
        browsecomp_evaluator = _make_evaluator({"browsecomp": 1.0})

        print("Running single BrowseComp evaluation (no optimization)...")
        bc_results = browsecomp_evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Just 1 example for speed
            output_dir=str(output_dir / "browsecomp_test"),
        )

        print("BrowseComp evaluation complete!")
        _print_evaluation_summary(bc_results)
    except Exception as e:
        logger.exception("Error running BrowseComp evaluation")
        print(f"Error: {e}")

    # Run 3: Combined benchmark with weights (minimal test)
    print(
        "\n🔍 Running combined benchmarks with weights (60% SimpleQA, 40% BrowseComp)..."
    )
    try:
        print("Creating composite benchmark evaluator with weights")
        composite_evaluator = _make_evaluator(
            {"simpleqa": 0.6, "browsecomp": 0.4}
        )

        print("Running combined benchmark evaluation (no optimization)...")
        combo_results = composite_evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Just 1 example for speed
            output_dir=str(output_dir / "combined_test"),
        )

        print("Combined benchmark evaluation complete!")
        _print_evaluation_summary(combo_results)
    except Exception as e:
        logger.exception("Error running combined benchmark evaluation")
        print(f"Error: {e}")

    # Run 4: Combined benchmark with speed optimization
    print("\n🔍 Running combined benchmarks with speed optimization...")
    try:
        # Import the necessary function
        from local_deep_research.benchmarks.optimization.api import (
            optimize_for_speed,
        )

        print("Running speed optimization with multi-benchmark weights...")
        # Very minimal run with just 1 trial for demonstration
        params_speed, score_speed = optimize_for_speed(
            query=query,
            output_dir=str(output_dir / "speed_optimization"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 0.6, "browsecomp": 0.4},
            timeout=5,  # Limit to 5 seconds
        )

        print("Speed optimization with multi-benchmark complete!")
        print_optimization_results(params_speed, score_speed)
        print("Speed metrics weighting: Quality (20%), Speed (80%)")
    except Exception as e:
        logger.exception(
            "Error running speed optimization with multi-benchmark"
        )
        print(f"Error: {e}")

    # Run 5: Combined benchmark with efficiency optimization (balancing
    # quality, speed and resources)
    print("\n🔍 Running combined benchmarks with efficiency optimization...")
    try:
        # Import the necessary function
        from local_deep_research.benchmarks.optimization.api import (
            optimize_for_efficiency,
        )

        print("Running efficiency optimization with multi-benchmark weights...")
        # Very minimal run with just 1 trial for demonstration
        params_efficiency, score_efficiency = optimize_for_efficiency(
            query=query,
            output_dir=str(output_dir / "efficiency_optimization"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 0.6, "browsecomp": 0.4},
            timeout=5,  # Limit to 5 seconds
        )

        print("Efficiency optimization with multi-benchmark complete!")
        print_optimization_results(params_efficiency, score_efficiency)
        print(
            "Efficiency metrics combine quality (40%), speed (30%), and resource usage (30%)"
        )
    except Exception as e:
        logger.exception(
            "Error running efficiency optimization with multi-benchmark"
        )
        print(f"Error: {e}")

    print("\nSkipping full optimization runs for time constraints.")
    print("The system fully supports:")
    print(
        "  1. BrowseComp-only optimization with benchmark_weights={'browsecomp': 1.0}"
    )
    print(
        "  2. Combined benchmarks with weights benchmark_weights={'simpleqa': 0.6, 'browsecomp': 0.4}"
    )
    print(
        "  3. Speed optimization with benchmark_weights using optimize_for_speed()"
    )
    print(
        "  4. Efficiency optimization with benchmark_weights using optimize_for_efficiency()"
    )
    print("\nThese would use the same API as demonstrated above.")

    print(f"\nAll optimization runs completed. Results saved to {output_dir}")
    print("Note: For serious optimization runs, increase n_trials to 20+")
|
|
|
|
|
|
# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
|