Files
local-deep-research/examples/optimization/multi_benchmark_simulation.py
LearningCircuit 0c6635ecc2 feat: Add pre-commit hook to enforce pathlib usage (issue #640) (#656)
* feat: Add pre-commit hook to enforce pathlib usage (issue #640)

- Created check-pathlib-usage.py pre-commit hook using AST parsing
- Detects os.path usage and suggests pathlib alternatives
- Fixed os.path.normpath usage in auth/routes.py to use PurePosixPath
- Added hook configuration to .pre-commit-config.yaml

The hook provides helpful suggestions for replacing os.path calls with
their pathlib equivalents for better cross-platform compatibility.

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* feat: Add missing pathlib pre-commit hook script

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* refactor: Migrate core src modules from os.path to pathlib

- Fixed web/app_factory.py, config/llm_config.py, metrics/token_counter.py
- Fixed utilities/es_utils.py, web/routes/benchmark_routes.py
- Fixed web/routes/settings_routes.py, web_search_engines/engines/search_engine_local.py
- Replaced os.path.join() with Path() / syntax
- Replaced os.path.exists() with Path().exists()
- Replaced os.path.basename() with Path().name
- Replaced os.path.dirname() with Path().parent

Part of the migration to modern pathlib API for better cross-platform
compatibility and cleaner code.

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* refactor: Migrate from os.path to pathlib in src and tests (issue #640)

Replaced os.path usage with pathlib.Path throughout:
- src/local_deep_research/benchmarks: All os.path.join, exists, dirname, basename, abspath replaced
- tests directory: Complete migration of all test files
- Improved cross-platform compatibility and code readability
- Kept os.path.expandvars in env_settings.py (no pathlib equivalent)

Part of pre-commit hook enforcement for pathlib usage.
Remaining work: examples/ and scripts/ directories.

Co-Authored-By: djpetti

* fix: Complete migration from os.path to pathlib.Path (issue #640)

Completed manual migration of all os.path usage to pathlib.Path across:
- scripts/ directory (3 files)
- examples/ directory (25 files total)
  - examples/benchmarks/ (8 files)
  - examples/optimization/ (16 files)
  - examples/show_env_vars.py
- src/local_deep_research/settings/env_settings.py

Changes made:
- Replaced os.path.join() with Path() / syntax
- Replaced os.path.exists() with Path().exists()
- Replaced os.path.dirname() with Path().parent
- Replaced os.path.basename() with Path().name or Path().stem
- Replaced os.path.abspath() with Path().resolve()
- Replaced os.makedirs() with Path().mkdir(parents=True, exist_ok=True)
- Added pathlib import where needed

Note: Kept os.path.expandvars in env_settings.py as there is no pathlib
equivalent. Added comment explaining this limitation.

This completes the pathlib migration for issue #640.

Co-Authored-By: djpetti

* fix: Allow os.path.expandvars in pathlib pre-commit hook

Updated the check-pathlib-usage.py pre-commit hook to skip checking
os.path.expandvars since it has no pathlib equivalent.

Changes:
- Added exception for expandvars in both visit_Attribute and visit_Call methods
- Added comment in equivalents dictionary noting expandvars is allowed
- This allows env_settings.py to use os.path.expandvars without failing checks

This resolves the pre-commit CI failure while maintaining the pathlib
enforcement for all other os.path methods.

Co-Authored-By: djpetti

---------

Co-authored-by: djpetti
2025-08-17 22:52:35 +02:00

414 lines
13 KiB
Python

"""
Multi-benchmark optimization simulation.
This script demonstrates how to use multi-benchmark optimization with weighted scores
without actually running real benchmarks (just simulation).
"""
import json
import random
import time
from datetime import datetime, UTC
from pathlib import Path
from typing import Any, Dict, Optional, Tuple
from loguru import logger
class BenchmarkSimulator:
    """Simulates running a single benchmark without actually executing it.

    Scores are derived from the supplied search parameters plus a random
    penalty, so repeated calls with identical parameters give different
    results unless the ``random`` module is seeded by the caller.
    """

    def __init__(
        self,
        name: str,
        quality_bias: float = 0.7,
        speed_factor: float = 0.2,
        simulated_delay: float = 0.5,
    ):
        """
        Initialize benchmark simulator.

        Args:
            name: Name of the benchmark. ``"simpleqa"`` selects the SimpleQA
                scoring profile; any other name uses the BrowseComp profile.
            quality_bias: Base quality score (will be adjusted by parameters)
            speed_factor: How much iterations affect speed (higher = more sensitive)
            simulated_delay: Seconds to sleep in each ``evaluate`` call to
                mimic a running benchmark. Zero or a negative value disables
                the pause.
        """
        self.name = name
        self.quality_bias = quality_bias
        self.speed_factor = speed_factor
        self.simulated_delay = simulated_delay

    def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Simulate running a benchmark.

        Args:
            params: System parameters to evaluate. Reads ``iterations``
                (default 2), ``questions_per_iteration`` (default 2) and
                ``search_strategy`` (default ``"standard"``).

        Returns:
            Dictionary with simulated metrics: ``benchmark_type``,
            ``quality_score`` and ``speed_score`` (both clamped to
            [0.0, 1.0]) and ``total_duration``.
        """
        iterations = params.get("iterations", 2)
        questions = params.get("questions_per_iteration", 2)
        strategy = params.get("search_strategy", "standard")

        # Simulate thinking for realism; skipped when the delay is disabled
        # so that callers can run many evaluations quickly.
        if self.simulated_delay > 0:
            time.sleep(self.simulated_delay)

        # Random penalty that keeps repeated runs from being identical.
        noise = random.uniform(0, 0.2)
        workload = iterations * questions

        # Different benchmark types respond differently to parameters
        if self.name == "simpleqa":
            # SimpleQA likes more iterations
            quality_score = self.quality_bias + (iterations * 0.04) - noise
            # SimpleQA is fast
            speed_score = 1.0 - (workload * self.speed_factor * 0.5)
        else:
            # BrowseComp likes more questions per iteration
            quality_score = self.quality_bias + (questions * 0.05) - noise
            # BrowseComp is slower
            speed_score = 1.0 - (workload * self.speed_factor)

        # Strategy effects: "rapid" trades quality for speed, "iterdrag"
        # trades speed for quality; other strategies leave scores untouched.
        if strategy == "rapid":
            speed_score += 0.1
            quality_score -= 0.05
        elif strategy == "iterdrag":
            quality_score += 0.1
            speed_score -= 0.05

        # Clamp values
        quality_score = max(0.0, min(1.0, quality_score))
        speed_score = max(0.0, min(1.0, speed_score))

        return {
            "benchmark_type": self.name,
            "quality_score": quality_score,
            "speed_score": speed_score,
            "total_duration": workload * random.uniform(10, 20),
        }
class CompositeBenchmarkSimulator:
    """Simulates running multiple benchmarks and combines them by weight."""

    def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None):
        """
        Initialize with benchmark weights.

        Args:
            benchmark_weights: Dictionary mapping benchmark names to weights.
                ``None`` or an empty dictionary falls back to the default
                ``{"simpleqa": 1.0}``.

        Raises:
            ValueError: If the weights do not sum to a positive number, since
                they could not be normalized meaningfully.
        """
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}

        # Create benchmark simulators
        self.simulators = {
            "simpleqa": BenchmarkSimulator(
                "simpleqa", quality_bias=0.75, speed_factor=0.15
            ),
            "browsecomp": BenchmarkSimulator(
                "browsecomp", quality_bias=0.7, speed_factor=0.25
            ),
        }

        # Benchmarks without a simulator are skipped by evaluate() but still
        # take part in normalization, so make that visible to the user.
        unknown = [
            name for name in self.benchmark_weights if name not in self.simulators
        ]
        if unknown:
            logger.warning(
                f"No simulator available for benchmarks {unknown}; "
                "they will be skipped during evaluation"
            )

        # Normalize weights so that they sum to 1.0
        total_weight = sum(self.benchmark_weights.values())
        if total_weight <= 0:
            raise ValueError(
                "benchmark_weights must sum to a positive number, "
                f"got {total_weight}"
            )
        self.normalized_weights = {
            k: w / total_weight for k, w in self.benchmark_weights.items()
        }

    def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
        """
        Simulate running multiple benchmarks with weights.

        Args:
            params: System parameters to evaluate

        Returns:
            Dictionary with the weighted ``quality_score`` and ``speed_score``,
            the summed (unweighted) ``total_duration``, the per-benchmark
            results under ``benchmark_results`` and the normalized weights
            under ``benchmark_weights``.
        """
        all_results = {}
        combined_quality_score = 0.0
        combined_speed_score = 0.0
        total_duration = 0.0

        # Run each benchmark that has a positive weight and a simulator
        for benchmark_name, weight in self.normalized_weights.items():
            if weight <= 0 or benchmark_name not in self.simulators:
                continue

            # Run benchmark simulation and keep the individual result
            result = self.simulators[benchmark_name].evaluate(params)
            all_results[benchmark_name] = result

            quality_score = result["quality_score"]
            speed_score = result["speed_score"]
            logger.info(
                f"Benchmark {benchmark_name}: quality={quality_score:.4f}, "
                f"speed={speed_score:.4f}, weight={weight:.2f}"
            )

            # Add the weighted contribution to the combined scores
            combined_quality_score += quality_score * weight
            combined_speed_score += speed_score * weight
            total_duration += result["total_duration"]

        # Return combined results
        return {
            "quality_score": combined_quality_score,
            "speed_score": combined_speed_score,
            "total_duration": total_duration,
            "benchmark_results": all_results,
            "benchmark_weights": self.normalized_weights,
        }
class OptunaOptimizerSimulator:
    """Simulates Optuna optimizer for demonstration purposes.

    Parameters are drawn by plain random sampling; no real Optuna study is
    created.
    """

    def __init__(
        self,
        benchmark_weights: Optional[Dict[str, float]] = None,
        metric_weights: Optional[Dict[str, float]] = None,
    ):
        """
        Initialize optimizer simulator.

        Args:
            benchmark_weights: Weights for different benchmarks
                (default ``{"simpleqa": 1.0}``)
            metric_weights: Weights for quality vs speed metrics
                (default ``{"quality": 0.6, "speed": 0.4}``)
        """
        self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
        self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
        self.benchmark_simulator = CompositeBenchmarkSimulator(
            benchmark_weights
        )

    @staticmethod
    def _sample_params(param_space: Dict[str, Any]) -> Dict[str, Any]:
        """Draw one random value for every supported entry of ``param_space``."""
        params = {}
        for param_name, param_config in param_space.items():
            param_type = param_config["type"]
            if param_type == "int":
                params[param_name] = random.randint(
                    param_config["low"], param_config["high"]
                )
            elif param_type == "float":
                params[param_name] = random.uniform(
                    param_config["low"], param_config["high"]
                )
            elif param_type == "categorical":
                params[param_name] = random.choice(param_config["choices"])
            else:
                # Keep going, but do not drop the parameter silently.
                logger.warning(
                    f"Skipping parameter {param_name!r}: "
                    f"unsupported type {param_type!r}"
                )
        return params

    def optimize(
        self, param_space: Dict[str, Any], n_trials: int = 10
    ) -> Tuple[Dict[str, Any], float, list[Dict[str, Any]]]:
        """
        Simulate optimization process.

        Args:
            param_space: Parameter space to explore. Each entry maps a
                parameter name to a config with a ``type`` of ``"int"`` or
                ``"float"`` (with ``low``/``high``) or ``"categorical"``
                (with ``choices``).
            n_trials: Number of trials

        Returns:
            Tuple of best parameters, best score and the list of per-trial
            records. With ``n_trials`` of zero this is ``({}, 0.0, [])``.
        """
        logger.info(f"Starting optimization with {n_trials} trials")
        logger.info(f"Parameter space: {param_space}")
        logger.info(f"Benchmark weights: {self.benchmark_weights}")
        logger.info(f"Metric weights: {self.metric_weights}")

        best_score = 0.0
        best_params: Dict[str, Any] = {}
        all_trials = []

        # Run simulated trials
        for i in range(n_trials):
            # Generate parameters for this trial
            params = self._sample_params(param_space)
            logger.info(
                f"Trial {i + 1}/{n_trials}: Testing parameters: {params}"
            )

            # Simulate benchmark evaluation
            result = self.benchmark_simulator.evaluate(params)

            # Calculate combined score based on weights
            quality_score = result["quality_score"]
            speed_score = result["speed_score"]
            combined_score = (
                self.metric_weights.get("quality", 0.6) * quality_score
                + self.metric_weights.get("speed", 0.4) * speed_score
            )
            logger.info(
                f"Trial {i + 1}: Quality: {quality_score:.4f}, Speed: {speed_score:.4f}, "
                f"Combined: {combined_score:.4f}"
            )

            # Save trial information
            all_trials.append(
                {
                    "trial_number": i + 1,
                    "params": params,
                    "quality_score": quality_score,
                    "speed_score": speed_score,
                    "combined_score": combined_score,
                    "benchmark_results": result["benchmark_results"],
                }
            )

            # The first trial always becomes the incumbent so that a run in
            # which no score exceeds 0.0 still reports real parameters.
            if i == 0 or combined_score > best_score:
                best_score = combined_score
                best_params = params.copy()
                logger.info(
                    f"New best parameters found: {best_params} with score: {best_score:.4f}"
                )

        # Return the best parameters together with the full trial history
        return best_params, best_score, all_trials
def optimize_parameters(
    param_space: Optional[Dict[str, Any]] = None,
    n_trials: int = 10,
    metric_weights: Optional[Dict[str, float]] = None,
    benchmark_weights: Optional[Dict[str, float]] = None,
) -> Tuple[Dict[str, Any], float, list[Dict[str, Any]]]:
    """
    Simulate parameter optimization.

    Args:
        param_space: Parameter space to explore. When ``None``, a default
            space over ``iterations`` (1-5), ``questions_per_iteration``
            (1-5) and ``search_strategy`` is used.
        n_trials: Number of trials to run
        metric_weights: Weights for quality vs speed
        benchmark_weights: Weights for different benchmarks

    Returns:
        Tuple of best parameters, best score and the list of per-trial
        records, exactly as returned by ``OptunaOptimizerSimulator.optimize``.
    """
    # Default parameter space
    if param_space is None:
        param_space = {
            "iterations": {
                "type": "int",
                "low": 1,
                "high": 5,
            },
            "questions_per_iteration": {
                "type": "int",
                "low": 1,
                "high": 5,
            },
            "search_strategy": {
                "type": "categorical",
                "choices": ["iterdrag", "standard", "rapid", "parallel"],
            },
        }

    # Create optimizer
    optimizer = OptunaOptimizerSimulator(
        benchmark_weights=benchmark_weights, metric_weights=metric_weights
    )

    # Run optimization
    return optimizer.optimize(param_space, n_trials)
def print_optimization_results(params: Dict[str, Any], score: float):
    """Write a banner-framed summary of the score and best parameters to stdout."""
    rule = "=" * 50

    # Assemble the whole report first; an empty entry yields a blank line.
    report = [
        "",
        rule,
        " OPTIMIZATION RESULTS ",
        rule,
        f"SCORE: {score:.4f}",
        "",
        "Best Parameters:",
    ]
    report.extend(f" {key}: {val}" for key, val in params.items())
    report.extend([rule, ""])

    print("\n".join(report))
def main():
    """Run the multi-benchmark optimization simulation.

    Runs three simulated optimizations (SimpleQA only, BrowseComp only and a
    60/40 weighted mix), prints each result and writes all of them to a JSON
    file inside a freshly created, timestamp-named directory under the
    current working directory.
    """
    # Create a timestamp-based directory for results
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    output_dir = Path("optimization_sim_" + timestamp)
    output_dir.mkdir(parents=True, exist_ok=True)

    print("\n🔬 Multi-Benchmark Optimization Simulation 🔬")
    print(f"Results will be saved to: {output_dir}")

    # Example 1: SimpleQA only (default)
    print("\n🔍 Running optimization with SimpleQA benchmark only...")
    params1, score1, trials1 = optimize_parameters(
        n_trials=5, benchmark_weights={"simpleqa": 1.0}
    )
    print_optimization_results(params1, score1)

    # Example 2: BrowseComp only
    print("\n🔍 Running optimization with BrowseComp benchmark only...")
    params2, score2, trials2 = optimize_parameters(
        n_trials=5, benchmark_weights={"browsecomp": 1.0}
    )
    print_optimization_results(params2, score2)

    # Example 3: 60/40 weighted combination (SimpleQA/BrowseComp).
    # Defined once so the saved report cannot drift from what was run.
    combined_weights = {
        "simpleqa": 0.6,  # 60% weight for SimpleQA
        "browsecomp": 0.4,  # 40% weight for BrowseComp
    }
    print("\n🔍 Running optimization with 60% SimpleQA and 40% BrowseComp...")
    params3, score3, trials3 = optimize_parameters(
        n_trials=10,
        benchmark_weights=combined_weights,
    )
    print_optimization_results(params3, score3)

    # Save results
    results = {
        "timestamp": timestamp,
        "simpleqa_only": {
            "best_params": params1,
            "best_score": score1,
            "trials": trials1,
        },
        "browsecomp_only": {
            "best_params": params2,
            "best_score": score2,
            "trials": trials2,
        },
        "weighted_combination": {
            "best_params": params3,
            "best_score": score3,
            "trials": trials3,
            "weights": dict(combined_weights),
        },
    }
    results_file = output_dir / "multi_benchmark_results.json"
    with results_file.open("w", encoding="utf-8") as f:
        # No ``default=`` hook is passed: the int and float values it used to
        # target are serialized natively, and a hook that hands any other
        # object back unchanged does not make it serializable.
        json.dump(results, f, indent=2)

    print(f"\n✅ Simulation complete! Results saved to {results_file}")
    print("\nComparison of best parameters:")
    print(f"- SimpleQA only: {params1}")
    print(f"- BrowseComp only: {params2}")
    print(f"- 60/40 weighted: {params3}")
    print("\nNote: This is a simulation for demonstration purposes only.")
    print(
        "Real optimization would run actual benchmarks to evaluate performance."
    )


if __name__ == "__main__":
    main()