local-deep-research/examples/optimization/run_multi_benchmark.py

#!/usr/bin/env python
"""
Multi-benchmark optimization example for Local Deep Research.

This script demonstrates how to run optimization with multiple benchmark types
and custom weights between them.

Usage:
    # Run from project root with venv activated
    cd /path/to/local-deep-research
    source .venv/bin/activate
    cd src
    python ../examples/optimization/run_multi_benchmark.py
"""

import os
import sys
from datetime import datetime, UTC
from pathlib import Path
from typing import Any, Dict

from loguru import logger

# Add src directory to Python path
src_dir = str((Path(__file__).parent.parent / "src").resolve())
if src_dir not in sys.path:
    sys.path.insert(0, src_dir)

# Use environment variables for configuration
# The system should be configured with proper environment variables:
# - ANTHROPIC_API_KEY for Anthropic API access
# - OPENROUTER_API_KEY for OpenRouter API access (if used)
# - LDR_DATA_DIR for data directory location (if needed)
data_dir = os.environ.get("LDR_DATA_DIR", str(Path(src_dir) / "data"))

# Import benchmark optimization functions
try:
    from local_deep_research.benchmarks.optimization.api import (
        optimize_parameters,
    )

    print("Successfully imported optimization API")
except ImportError as e:
    print(f"Error importing optimization API: {e}")
    print("Current sys.path:", sys.path)
    sys.exit(1)


def print_optimization_results(params: Dict[str, Any], score: float):
    """Print optimization results in a nicely formatted way."""
    print("\n" + "=" * 50)
    print(" OPTIMIZATION RESULTS ")
    print("=" * 50)
    print(f"SCORE: {score:.4f}")
    print("\nBest Parameters:")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print("=" * 50 + "\n")


def main():
    """Run multi-benchmark optimization examples."""
    # Create a timestamp-based directory for results
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")

    # Put results in the data directory for easier access
    if Path(data_dir).is_dir():
        output_dir = str(
            Path(data_dir)
            / "optimization_results"
            / f"multi_benchmark_{timestamp}"
        )
    else:
        output_dir = str(
            Path("optimization_results") / f"multi_benchmark_{timestamp}"
        )

    os.makedirs(output_dir, exist_ok=True)
    print(f"Results will be saved to: {output_dir}")

    print("\n🔬 Multi-Benchmark Optimization Example 🔬")
    print("Results will be saved to: " + output_dir)

    # Define a very small parameter space for testing
    tiny_param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 3,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["iterdrag", "rapid", "parallel"],
        },
    }

    # Example query for running optimization
    query = "Recent developments in fusion energy research"

    # Very small parameter space for quick testing
    tiny_param_space = {
        "iterations": {
            "type": "int",
            "low": 1,
            "high": 2,
            "step": 1,
        },
        "questions_per_iteration": {
            "type": "int",
            "low": 1,
            "high": 2,
            "step": 1,
        },
        "search_strategy": {
            "type": "categorical",
            "choices": ["rapid"],
        },
    }

    # Run 1: SimpleQA benchmark only with minimal trials
    print("\n🔍 Running SimpleQA-only optimization (minimal test)...")
    try:
        # Use very minimal settings for testing
        mini_system_config = {
            "iterations": 1,
            "questions_per_iteration": 1,
            "search_strategy": "rapid",
            "max_results": 2,  # Very few results
            "search_tool": "wikipedia",  # Fast search engine
            "timeout": 5,  # Extremely short timeout to speed up demo
        }

        # Import the evaluator directly for faster testing
        from local_deep_research.benchmarks.evaluators import (
            CompositeBenchmarkEvaluator,
        )

        print("Creating benchmark evaluator with SimpleQA only")
        evaluator = CompositeBenchmarkEvaluator({"simpleqa": 1.0})

        print("Running single benchmark evaluation (no optimization)...")
        quality_results = evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Use just 1 example for speed
            output_dir=str(Path(output_dir) / "simpleqa_test"),
        )

        print("Benchmark evaluation complete!")
        print(f"Quality score: {quality_results.get('quality_score', 0.0):.4f}")
        print(
            "Benchmark weights used:",
            quality_results.get("benchmark_weights", {}),
        )
        print(
            "Individual benchmark results:",
            list(quality_results.get("benchmark_results", {}).keys()),
        )

        # Also run the Optuna optimizer with minimal settings
        print("\nRunning minimal Optuna optimization...")
        params1, score1 = optimize_parameters(
            query=query,
            param_space=tiny_param_space,  # Use tiny param space
            output_dir=str(Path(output_dir) / "simpleqa_only"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 1.0},  # SimpleQA only
            timeout=5,  # Limit to 5 seconds
        )
        print_optimization_results(params1, score1)
    except Exception as e:
        logger.exception("Error running SimpleQA optimization")
        print(f"Error: {e}")

    # Run 2: BrowseComp benchmark only (minimal test)
    print("\n🔍 Running BrowseComp-only benchmark (minimal test)...")
    try:
        print("Creating benchmark evaluator with BrowseComp only")
        browsecomp_evaluator = CompositeBenchmarkEvaluator({"browsecomp": 1.0})

        print("Running single BrowseComp evaluation (no optimization)...")
        bc_results = browsecomp_evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Just 1 example for speed
            output_dir=str(Path(output_dir) / "browsecomp_test"),
        )

        print("BrowseComp evaluation complete!")
        print(f"Quality score: {bc_results.get('quality_score', 0.0):.4f}")
        print(
            "Benchmark weights used:", bc_results.get("benchmark_weights", {})
        )
        print(
            "Individual benchmark results:",
            list(bc_results.get("benchmark_results", {}).keys()),
        )

    except Exception as e:
        logger.exception("Error running BrowseComp evaluation")
        print(f"Error: {e}")

    # Run 3: Combined benchmark with weights (minimal test)
    print(
        "\n🔍 Running combined benchmarks with weights (60% SimpleQA, 40% BrowseComp)..."
    )
    try:
        print("Creating composite benchmark evaluator with weights")
        composite_evaluator = CompositeBenchmarkEvaluator(
            {"simpleqa": 0.6, "browsecomp": 0.4}
        )

        print("Running combined benchmark evaluation (no optimization)...")
        combo_results = composite_evaluator.evaluate(
            system_config=mini_system_config,
            num_examples=1,  # Just 1 example for speed
            output_dir=str(Path(output_dir) / "combined_test"),
        )

        print("Combined benchmark evaluation complete!")
        print(f"Quality score: {combo_results.get('quality_score', 0.0):.4f}")
        print(
            "Benchmark weights used:",
            combo_results.get("benchmark_weights", {}),
        )
        print(
            "Individual benchmark results:",
            list(combo_results.get("benchmark_results", {}).keys()),
        )

    except Exception as e:
        logger.exception("Error running combined benchmark evaluation")
        print(f"Error: {e}")

    # Run 4: Combined benchmark with speed optimization
    print("\n🔍 Running combined benchmarks with speed optimization...")
    try:
        # Import the necessary function
        from local_deep_research.benchmarks.optimization.api import (
            optimize_for_speed,
        )

        print("Running speed optimization with multi-benchmark weights...")
        # Very minimal run with just 1 trial for demonstration
        params_speed, score_speed = optimize_for_speed(
            query=query,
            output_dir=str(Path(output_dir) / "speed_optimization"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 0.6, "browsecomp": 0.4},
            timeout=5,  # Limit to 5 seconds
        )

        print("Speed optimization with multi-benchmark complete!")
        print_optimization_results(params_speed, score_speed)
        print("Speed metrics weighting: Quality (20%), Speed (80%)")

    except Exception as e:
        logger.exception(
            "Error running speed optimization with multi-benchmark"
        )
        print(f"Error: {e}")

    # Run 5: Combined benchmark with efficiency optimization (balancing quality, speed and resources)
    print("\n🔍 Running combined benchmarks with efficiency optimization...")
    try:
        # Import the necessary function
        from local_deep_research.benchmarks.optimization.api import (
            optimize_for_efficiency,
        )

        print("Running efficiency optimization with multi-benchmark weights...")
        # Very minimal run with just 1 trial for demonstration
        params_efficiency, score_efficiency = optimize_for_efficiency(
            query=query,
            output_dir=str(Path(output_dir) / "efficiency_optimization"),
            n_trials=1,  # Just one trial for testing
            benchmark_weights={"simpleqa": 0.6, "browsecomp": 0.4},
            timeout=5,  # Limit to 5 seconds
        )

        print("Efficiency optimization with multi-benchmark complete!")
        print_optimization_results(params_efficiency, score_efficiency)
        print(
            "Efficiency metrics combine quality (40%), speed (30%), and resource usage (30%)"
        )

    except Exception as e:
        logger.exception(
            "Error running efficiency optimization with multi-benchmark"
        )
        print(f"Error: {e}")

    print("\nSkipping full optimization runs for time constraints.")
    print("The system fully supports:")
    print(
        "  1. BrowseComp-only optimization with benchmark_weights={'browsecomp': 1.0}"
    )
    print(
        "  2. Combined benchmarks with weights benchmark_weights={'simpleqa': 0.6, 'browsecomp': 0.4}"
    )
    print(
        "  3. Speed optimization with benchmark_weights using optimize_for_speed()"
    )
    print(
        "  4. Efficiency optimization with benchmark_weights using optimize_for_efficiency()"
    )
    print("\nThese would use the same API as demonstrated above.")

    print(f"\nAll optimization runs completed. Results saved to {output_dir}")
    print("Note: For serious optimization runs, increase n_trials to 20+")


if __name__ == "__main__":
    main()