local-deep-research/examples/optimization/run_parallel_benchmark.py

#!/usr/bin/env python
"""
Run SimpleQA and BrowseComp benchmarks in parallel with 300 examples each.

This script demonstrates running multiple benchmarks in parallel with a large number of examples.

Usage:
    # Install dependencies with PDM
    cd /path/to/local-deep-research
    pdm install

    # Run the script with PDM
    pdm run python examples/optimization/run_parallel_benchmark.py
"""

import argparse
import concurrent.futures
import os
import sys
import time
from datetime import datetime, UTC
from pathlib import Path

from loguru import logger

# Add the src directory to the Python path
project_root = str(Path(__file__).parent.parent.parent.resolve())
sys.path.insert(0, str(Path(project_root) / "src"))


def run_simpleqa_benchmark(
    num_examples,
    output_dir,
    model=None,
    provider=None,
    endpoint_url=None,
    api_key=None,
):
    """Run SimpleQA benchmark with specified number of examples."""
    from local_deep_research.benchmarks.benchmark_functions import (
        evaluate_simpleqa,
    )

    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
    start_time = time.time()

    # Run the benchmark
    results = evaluate_simpleqa(
        num_examples=num_examples,
        search_iterations=2,
        questions_per_iteration=3,
        search_strategy="source_based",
        search_tool="searxng",
        search_model=model,
        search_provider=provider,
        endpoint_url=endpoint_url,
        output_dir=str(Path(output_dir) / "simpleqa"),
        evaluation_provider="ANTHROPIC",
        evaluation_model="claude-3-7-sonnet-20250219",
    )

    duration = time.time() - start_time
    logger.info(f"SimpleQA benchmark completed in {duration:.1f} seconds")

    if results and isinstance(results, dict):
        logger.info(f"SimpleQA accuracy: {results.get('accuracy', 'N/A')}")

    return results


def run_browsecomp_benchmark(
    num_examples,
    output_dir,
    model=None,
    provider=None,
    endpoint_url=None,
    api_key=None,
):
    """Run BrowseComp benchmark with specified number of examples."""
    from local_deep_research.benchmarks.benchmark_functions import (
        evaluate_browsecomp,
    )

    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
    start_time = time.time()

    # Run the benchmark
    results = evaluate_browsecomp(
        num_examples=num_examples,
        search_iterations=3,
        questions_per_iteration=3,
        search_strategy="source_based",
        search_tool="searxng",
        search_model=model,
        search_provider=provider,
        endpoint_url=endpoint_url,
        output_dir=str(Path(output_dir) / "browsecomp"),
        evaluation_provider="ANTHROPIC",
        evaluation_model="claude-3-7-sonnet-20250219",
    )

    duration = time.time() - start_time
    logger.info(f"BrowseComp benchmark completed in {duration:.1f} seconds")

    if results and isinstance(results, dict):
        logger.info(f"BrowseComp accuracy: {results.get('accuracy', 'N/A')}")

    return results


def setup_llm_environment(
    model=None, provider=None, endpoint_url=None, api_key=None
):
    """Set up environment variables for LLM configuration."""
    if model:
        os.environ["LDR_LLM__MODEL"] = model
        logger.info(f"Using LLM model: {model}")

    if provider:
        os.environ["LDR_LLM__PROVIDER"] = provider
        logger.info(f"Using LLM provider: {provider}")

    if endpoint_url:
        os.environ["OPENAI_ENDPOINT_URL"] = endpoint_url
        os.environ["LDR_LLM__OPENAI_ENDPOINT_URL"] = endpoint_url
        logger.info(f"Using endpoint URL: {endpoint_url}")

    if api_key:
        # Set the appropriate environment variable based on provider
        if provider == "openai":
            os.environ["OPENAI_API_KEY"] = api_key
            os.environ["LDR_LLM__OPENAI_API_KEY"] = api_key
        elif provider == "openai_endpoint":
            os.environ["OPENAI_ENDPOINT_API_KEY"] = api_key
            os.environ["LDR_LLM__OPENAI_ENDPOINT_API_KEY"] = api_key
        elif provider == "anthropic":
            os.environ["ANTHROPIC_API_KEY"] = api_key
            os.environ["LDR_LLM__ANTHROPIC_API_KEY"] = api_key

        logger.info("API key configured")


def main():
    parser = argparse.ArgumentParser(
        description="Run SimpleQA and BrowseComp benchmarks in parallel"
    )
    parser.add_argument(
        "--examples",
        type=int,
        default=300,
        help="Number of examples for each benchmark (default: 300)",
    )

    # LLM configuration options
    parser.add_argument(
        "--model",
        help="Model name for the LLM (e.g., 'claude-3-sonnet-20240229')",
    )
    parser.add_argument(
        "--provider",
        help="Provider for the LLM (e.g., 'anthropic', 'openai', 'openai_endpoint')",
    )
    parser.add_argument(
        "--endpoint-url",
        help="Custom endpoint URL (e.g., 'https://openrouter.ai/api/v1')",
    )
    parser.add_argument("--api-key", help="API key for the LLM provider")

    args = parser.parse_args()

    # Create timestamp for unique output directory
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    output_dir = str(
        Path(project_root)
        / "benchmark_results"
        / f"parallel_benchmark_{timestamp}"
    )
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Display start information
    print(f"Starting parallel benchmarks with {args.examples} examples each")
    print(f"Results will be saved to: {output_dir}")

    # Set up LLM environment if specified
    setup_llm_environment(
        model=args.model,
        provider=args.provider,
        endpoint_url=args.endpoint_url,
        api_key=args.api_key,
    )

    # Start time for total execution
    total_start_time = time.time()

    # Run benchmarks in parallel using ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        # Submit both benchmark jobs
        simpleqa_future = executor.submit(
            run_simpleqa_benchmark,
            args.examples,
            output_dir,
            args.model,
            args.provider,
            args.endpoint_url,
            args.api_key,
        )

        browsecomp_future = executor.submit(
            run_browsecomp_benchmark,
            args.examples,
            output_dir,
            args.model,
            args.provider,
            args.endpoint_url,
            args.api_key,
        )

        # Get results from both futures
        try:
            simpleqa_results = simpleqa_future.result()
            print("SimpleQA benchmark completed successfully")
        except Exception:
            logger.exception("Error in SimpleQA benchmark")
            simpleqa_results = None

        try:
            browsecomp_results = browsecomp_future.result()
            print("BrowseComp benchmark completed successfully")
        except Exception:
            logger.exception("Error in BrowseComp benchmark")
            browsecomp_results = None

    # Calculate total time
    total_duration = time.time() - total_start_time

    # Print summary
    print("\n" + "=" * 50)
    print(" PARALLEL BENCHMARK SUMMARY ")
    print("=" * 50)
    print(f"Total duration: {total_duration:.1f} seconds")
    print(f"Examples per benchmark: {args.examples}")

    if simpleqa_results and isinstance(simpleqa_results, dict):
        print(f"SimpleQA accuracy: {simpleqa_results.get('accuracy', 'N/A')}")
    else:
        print("SimpleQA: Failed or no results")

    if browsecomp_results and isinstance(browsecomp_results, dict):
        print(
            f"BrowseComp accuracy: {browsecomp_results.get('accuracy', 'N/A')}"
        )
    else:
        print("BrowseComp: Failed or no results")

    print(f"Results saved to: {output_dir}")
    print("=" * 50)

    # Save summary to JSON file
    try:
        import json

        summary = {
            "timestamp": timestamp,
            "examples_per_benchmark": args.examples,
            "total_duration": total_duration,
            "simpleqa": {
                "accuracy": (
                    simpleqa_results.get("accuracy")
                    if simpleqa_results
                    else None
                ),
                "completed": simpleqa_results is not None,
            },
            "browsecomp": {
                "accuracy": (
                    browsecomp_results.get("accuracy")
                    if browsecomp_results
                    else None
                ),
                "completed": browsecomp_results is not None,
            },
            "model": args.model,
            "provider": args.provider,
        }

        with open(
            Path(output_dir) / "parallel_benchmark_summary.json", "w"
        ) as f:
            json.dump(summary, f, indent=2)

    except Exception:
        logger.exception("Error saving summary")

    return 0


if __name__ == "__main__":
    sys.exit(main())