local-deep-research/examples/optimization/multi_benchmark_simulation.py

"""
Multi-benchmark optimization simulation.

This script demonstrates how to use multi-benchmark optimization with weighted scores
without actually running real benchmarks (just simulation).
"""

import json
import random
import time
from datetime import datetime, UTC
from pathlib import Path
from typing import Any, Dict, Optional, Tuple

from loguru import logger


class BenchmarkSimulator:
    """Simulates running benchmarks without actually executing them."""

    def __init__(
        self, name: str, quality_bias: float = 0.7, speed_factor: float = 0.2
    ):
        """
        Initialize benchmark simulator.

        Args:
            name: Name of the benchmark
            quality_bias: Base quality score (will be adjusted by parameters)
            speed_factor: How much iterations affect speed (higher = more sensitive)
        """
        self.name = name
        self.quality_bias = quality_bias
        self.speed_factor = speed_factor

    def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Simulate running a benchmark.

        Args:
            params: System parameters to evaluate

        Returns:
            Dictionary with simulated metrics
        """
        # Add some randomness to make it interesting
        iterations = params.get("iterations", 2)
        questions = params.get("questions_per_iteration", 2)
        strategy = params.get("search_strategy", "standard")

        # Simulate thinking for realism
        time.sleep(0.5)

        # Calculate quality score based on parameters
        # Different benchmark types respond differently to parameters
        if self.name == "simpleqa":
            # SimpleQA likes more iterations
            quality_score = (
                self.quality_bias + (iterations * 0.04) - random.uniform(0, 0.2)
            )
            # SimpleQA is fast
            speed_score = 1.0 - (
                iterations * questions * self.speed_factor * 0.5
            )
        else:
            # BrowseComp likes more questions per iteration
            quality_score = (
                self.quality_bias + (questions * 0.05) - random.uniform(0, 0.2)
            )
            # BrowseComp is slower
            speed_score = 1.0 - (iterations * questions * self.speed_factor)

        # Strategy effects
        if strategy == "rapid":
            speed_score += 0.1
            quality_score -= 0.05
        elif strategy == "iterdrag":
            quality_score += 0.1
            speed_score -= 0.05

        # Clamp values
        quality_score = max(0.0, min(1.0, quality_score))
        speed_score = max(0.0, min(1.0, speed_score))

        return {
            "benchmark_type": self.name,
            "quality_score": quality_score,
            "speed_score": speed_score,
            "total_duration": iterations * questions * random.uniform(10, 20),
        }


class CompositeBenchmarkSimulator:
    """Simulates running multiple benchmarks with weights."""

    def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None):
        """
        Initialize with benchmark weights.

        Args:
            benchmark_weights: Dictionary mapping benchmark names to weights
                Default: {"simpleqa": 1.0}
        """
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}

        # Create benchmark simulators
        self.simulators = {
            "simpleqa": BenchmarkSimulator(
                "simpleqa", quality_bias=0.75, speed_factor=0.15
            ),
            "browsecomp": BenchmarkSimulator(
                "browsecomp", quality_bias=0.7, speed_factor=0.25
            ),
        }

        # Normalize weights
        total_weight = sum(self.benchmark_weights.values())
        self.normalized_weights = {
            k: w / total_weight for k, w in self.benchmark_weights.items()
        }

    def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Simulate running multiple benchmarks with weights.

        Args:
            params: System parameters to evaluate

        Returns:
            Dictionary with weighted results
        """
        all_results = {}
        combined_quality_score = 0.0
        combined_speed_score = 0.0
        total_duration = 0.0

        # Run each benchmark with weight > 0
        for benchmark_name, weight in self.normalized_weights.items():
            if weight > 0 and benchmark_name in self.simulators:
                simulator = self.simulators[benchmark_name]

                # Run benchmark simulation
                result = simulator.evaluate(params)

                # Store individual results
                all_results[benchmark_name] = result

                # Calculate weighted contribution
                quality_score = result["quality_score"]
                speed_score = result["speed_score"]

                weighted_quality = quality_score * weight
                weighted_speed = speed_score * weight

                logger.info(
                    f"Benchmark {benchmark_name}: quality={quality_score:.4f}, "
                    f"speed={speed_score:.4f}, weight={weight:.2f}"
                )

                # Add to combined scores
                combined_quality_score += weighted_quality
                combined_speed_score += weighted_speed
                total_duration += result["total_duration"]

        # Return combined results
        return {
            "quality_score": combined_quality_score,
            "speed_score": combined_speed_score,
            "total_duration": total_duration,
            "benchmark_results": all_results,
            "benchmark_weights": self.normalized_weights,
        }


class OptunaOptimizerSimulator:
    """Simulates Optuna optimizer for demonstration purposes."""

    def __init__(
        self,
        benchmark_weights: Optional[Dict[str, float]] = None,
        metric_weights: Optional[Dict[str, float]] = None,
    ):
        """
        Initialize optimizer simulator.

        Args:
            benchmark_weights: Weights for different benchmarks
            metric_weights: Weights for quality vs speed metrics
        """
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
        self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
        self.benchmark_simulator = CompositeBenchmarkSimulator(
            benchmark_weights
        )

    def optimize(
        self, param_space: Dict[str, Any], n_trials: int = 10
    ) -> Tuple[Dict[str, Any], float]:
        """
        Simulate optimization process.

        Args:
            param_space: Parameter space to explore
            n_trials: Number of trials

        Returns:
            Tuple of best parameters and best score
        """
        logger.info(f"Starting optimization with {n_trials} trials")
        logger.info(f"Parameter space: {param_space}")
        logger.info(f"Benchmark weights: {self.benchmark_weights}")
        logger.info(f"Metric weights: {self.metric_weights}")

        best_score = 0.0
        best_params = {}
        all_trials = []

        # Run simulated trials
        for i in range(n_trials):
            # Generate parameters for this trial
            params = {}
            for param_name, param_config in param_space.items():
                param_type = param_config["type"]

                if param_type == "int":
                    params[param_name] = random.randint(
                        param_config["low"], param_config["high"]
                    )
                elif param_type == "categorical":
                    params[param_name] = random.choice(param_config["choices"])

            logger.info(
                f"Trial {i + 1}/{n_trials}: Testing parameters: {params}"
            )

            # Simulate benchmark evaluation
            result = self.benchmark_simulator.evaluate(params)

            # Calculate combined score based on weights
            quality_score = result["quality_score"]
            speed_score = result["speed_score"]

            combined_score = (
                self.metric_weights.get("quality", 0.6) * quality_score
                + self.metric_weights.get("speed", 0.4) * speed_score
            )

            logger.info(
                f"Trial {i + 1}: Quality: {quality_score:.4f}, Speed: {speed_score:.4f}, "
                f"Combined: {combined_score:.4f}"
            )

            # Save trial information
            trial_info = {
                "trial_number": i + 1,
                "params": params,
                "quality_score": quality_score,
                "speed_score": speed_score,
                "combined_score": combined_score,
                "benchmark_results": result["benchmark_results"],
            }
            all_trials.append(trial_info)

            # Update best parameters if this trial is better
            if combined_score > best_score:
                best_score = combined_score
                best_params = params.copy()
                logger.info(
                    f"New best parameters found: {best_params} with score: {best_score:.4f}"
                )

        # Return the best parameters
        return best_params, best_score, all_trials


def optimize_parameters(
    param_space: Optional[Dict[str, Any]] = None,
    n_trials: int = 10,
    metric_weights: Optional[Dict[str, float]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float]:
    """
    Simulate parameter optimization.

    Args:
        param_space: Parameter space to explore
        n_trials: Number of trials to run
        metric_weights: Weights for quality vs speed
        benchmark_weights: Weights for different benchmarks

    Returns:
        Tuple of best parameters and best score
    """
    # Default parameter space
    if param_space is None:
        param_space = {
            "iterations": {
                "type": "int",
                "low": 1,
                "high": 5,
            },
            "questions_per_iteration": {
                "type": "int",
                "low": 1,
                "high": 5,
            },
            "search_strategy": {
                "type": "categorical",
                "choices": ["iterdrag", "standard", "rapid", "parallel"],
            },
        }

    # Create optimizer
    optimizer = OptunaOptimizerSimulator(
        benchmark_weights=benchmark_weights, metric_weights=metric_weights
    )

    # Run optimization
    return optimizer.optimize(param_space, n_trials)


def print_optimization_results(params: Dict[str, Any], score: float):
    """Print optimization results in a nicely formatted way."""
    print("\n" + "=" * 50)
    print(" OPTIMIZATION RESULTS ")
    print("=" * 50)
    print(f"SCORE: {score:.4f}")
    print("\nBest Parameters:")
    for param, value in params.items():
        print(f"  {param}: {value}")
    print("=" * 50 + "\n")


def main():
    """Run the multi-benchmark optimization simulation."""
    # Create a timestamp-based directory for results
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    output_dir = "optimization_sim_" + timestamp
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    print("\n🔬 Multi-Benchmark Optimization Simulation 🔬")
    print(f"Results will be saved to: {output_dir}")

    # Example 1: SimpleQA only (default)
    print("\n🔍 Running optimization with SimpleQA benchmark only...")
    params1, score1, trials1 = optimize_parameters(
        n_trials=5, benchmark_weights={"simpleqa": 1.0}
    )
    print_optimization_results(params1, score1)

    # Example 2: BrowseComp only
    print("\n🔍 Running optimization with BrowseComp benchmark only...")
    params2, score2, trials2 = optimize_parameters(
        n_trials=5, benchmark_weights={"browsecomp": 1.0}
    )
    print_optimization_results(params2, score2)

    # Example 3: 60/40 weighted combination (SimpleQA/BrowseComp)
    print("\n🔍 Running optimization with 60% SimpleQA and 40% BrowseComp...")
    params3, score3, trials3 = optimize_parameters(
        n_trials=10,
        benchmark_weights={
            "simpleqa": 0.6,  # 60% weight for SimpleQA
            "browsecomp": 0.4,  # 40% weight for BrowseComp
        },
    )
    print_optimization_results(params3, score3)

    # Save results
    results = {
        "timestamp": timestamp,
        "simpleqa_only": {
            "best_params": params1,
            "best_score": score1,
            "trials": trials1,
        },
        "browsecomp_only": {
            "best_params": params2,
            "best_score": score2,
            "trials": trials2,
        },
        "weighted_combination": {
            "best_params": params3,
            "best_score": score3,
            "trials": trials3,
            "weights": {"simpleqa": 0.6, "browsecomp": 0.4},
        },
    }

    results_file = str(Path(output_dir) / "multi_benchmark_results.json")
    with open(results_file, "w") as f:
        # Convert all values to serializable types
        json.dump(
            results,
            f,
            indent=2,
            default=lambda o: float(o) if isinstance(o, (float, int)) else o,
        )

    print(f"\n✅ Simulation complete! Results saved to {results_file}")
    print("\nComparison of best parameters:")
    print(f"- SimpleQA only:   {params1}")
    print(f"- BrowseComp only: {params2}")
    print(f"- 60/40 weighted:  {params3}")

    print("\nNote: This is a simulation for demonstration purposes only.")
    print(
        "Real optimization would run actual benchmarks to evaluate performance."
    )


if __name__ == "__main__":
    main()