mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
* feat: Add pre-commit hook to enforce pathlib usage (issue #640) - Created check-pathlib-usage.py pre-commit hook using AST parsing - Detects os.path usage and suggests pathlib alternatives - Fixed os.path.normpath usage in auth/routes.py to use PurePosixPath - Added hook configuration to .pre-commit-config.yaml The hook provides helpful suggestions for replacing os.path calls with their pathlib equivalents for better cross-platform compatibility. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * feat: Add missing pathlib pre-commit hook script Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate core src modules from os.path to pathlib - Fixed web/app_factory.py, config/llm_config.py, metrics/token_counter.py - Fixed utilities/es_utils.py, web/routes/benchmark_routes.py - Fixed web/routes/settings_routes.py, web_search_engines/engines/search_engine_local.py - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.basename() with Path().name - Replaced os.path.dirname() with Path().parent Part of the migration to modern pathlib API for better cross-platform compatibility and cleaner code. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate from os.path to pathlib in src and tests (issue #640) Replaced os.path usage with pathlib.Path throughout: - src/local_deep_research/benchmarks: All os.path.join, exists, dirname, basename, abspath replaced - tests directory: Complete migration of all test files - Improved cross-platform compatibility and code readability - Kept os.path.expandvars in env_settings.py (no pathlib equivalent) Part of pre-commit hook enforcement for pathlib usage. Remaining work: examples/ and scripts/ directories. Co-Authored-By: djpetti * fix: Complete migration from os.path to pathlib.Path (issue #640) Completed manual migration of all os.path usage to pathlib.Path across: - scripts/ directory (3 files) - examples/ directory (25 files total) - examples/benchmarks/ (8 files) - examples/optimization/ (16 files) - examples/show_env_vars.py - src/local_deep_research/settings/env_settings.py Changes made: - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.dirname() with Path().parent - Replaced os.path.basename() with Path().name or Path().stem - Replaced os.path.abspath() with Path().resolve() - Replaced os.makedirs() with Path().mkdir(parents=True, exist_ok=True) - Added pathlib import where needed Note: Kept os.path.expandvars in env_settings.py as there is no pathlib equivalent. Added comment explaining this limitation. This completes the pathlib migration for issue #640. Co-Authored-By: djpetti * fix: Allow os.path.expandvars in pathlib pre-commit hook Updated the check-pathlib-usage.py pre-commit hook to skip checking os.path.expandvars since it has no pathlib equivalent. Changes: - Added exception for expandvars in both visit_Attribute and visit_Call methods - Added comment in equivalents dictionary noting expandvars is allowed - This allows env_settings.py to use os.path.expandvars without failing checks This resolves the pre-commit CI failure while maintaining the pathlib enforcement for all other os.path methods. Co-Authored-By: djpetti --------- Co-authored-by: djpetti
414 lines
13 KiB
Python
414 lines
13 KiB
Python
"""
|
|
Multi-benchmark optimization simulation.
|
|
|
|
This script demonstrates how to use multi-benchmark optimization with weighted scores
|
|
without actually running real benchmarks (just simulation).
|
|
"""
|
|
|
|
import json
|
|
import random
|
|
import time
|
|
from datetime import datetime, UTC
|
|
from pathlib import Path
|
|
from typing import Any, Dict, Optional, Tuple
|
|
|
|
from loguru import logger
|
|
|
|
|
|
class BenchmarkSimulator:
|
|
"""Simulates running benchmarks without actually executing them."""
|
|
|
|
def __init__(
|
|
self, name: str, quality_bias: float = 0.7, speed_factor: float = 0.2
|
|
):
|
|
"""
|
|
Initialize benchmark simulator.
|
|
|
|
Args:
|
|
name: Name of the benchmark
|
|
quality_bias: Base quality score (will be adjusted by parameters)
|
|
speed_factor: How much iterations affect speed (higher = more sensitive)
|
|
"""
|
|
self.name = name
|
|
self.quality_bias = quality_bias
|
|
self.speed_factor = speed_factor
|
|
|
|
def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Simulate running a benchmark.
|
|
|
|
Args:
|
|
params: System parameters to evaluate
|
|
|
|
Returns:
|
|
Dictionary with simulated metrics
|
|
"""
|
|
# Add some randomness to make it interesting
|
|
iterations = params.get("iterations", 2)
|
|
questions = params.get("questions_per_iteration", 2)
|
|
strategy = params.get("search_strategy", "standard")
|
|
|
|
# Simulate thinking for realism
|
|
time.sleep(0.5)
|
|
|
|
# Calculate quality score based on parameters
|
|
# Different benchmark types respond differently to parameters
|
|
if self.name == "simpleqa":
|
|
# SimpleQA likes more iterations
|
|
quality_score = (
|
|
self.quality_bias + (iterations * 0.04) - random.uniform(0, 0.2)
|
|
)
|
|
# SimpleQA is fast
|
|
speed_score = 1.0 - (
|
|
iterations * questions * self.speed_factor * 0.5
|
|
)
|
|
else:
|
|
# BrowseComp likes more questions per iteration
|
|
quality_score = (
|
|
self.quality_bias + (questions * 0.05) - random.uniform(0, 0.2)
|
|
)
|
|
# BrowseComp is slower
|
|
speed_score = 1.0 - (iterations * questions * self.speed_factor)
|
|
|
|
# Strategy effects
|
|
if strategy == "rapid":
|
|
speed_score += 0.1
|
|
quality_score -= 0.05
|
|
elif strategy == "iterdrag":
|
|
quality_score += 0.1
|
|
speed_score -= 0.05
|
|
|
|
# Clamp values
|
|
quality_score = max(0.0, min(1.0, quality_score))
|
|
speed_score = max(0.0, min(1.0, speed_score))
|
|
|
|
return {
|
|
"benchmark_type": self.name,
|
|
"quality_score": quality_score,
|
|
"speed_score": speed_score,
|
|
"total_duration": iterations * questions * random.uniform(10, 20),
|
|
}
|
|
|
|
|
|
class CompositeBenchmarkSimulator:
|
|
"""Simulates running multiple benchmarks with weights."""
|
|
|
|
def __init__(self, benchmark_weights: Optional[Dict[str, float]] = None):
|
|
"""
|
|
Initialize with benchmark weights.
|
|
|
|
Args:
|
|
benchmark_weights: Dictionary mapping benchmark names to weights
|
|
Default: {"simpleqa": 1.0}
|
|
"""
|
|
self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
|
|
|
|
# Create benchmark simulators
|
|
self.simulators = {
|
|
"simpleqa": BenchmarkSimulator(
|
|
"simpleqa", quality_bias=0.75, speed_factor=0.15
|
|
),
|
|
"browsecomp": BenchmarkSimulator(
|
|
"browsecomp", quality_bias=0.7, speed_factor=0.25
|
|
),
|
|
}
|
|
|
|
# Normalize weights
|
|
total_weight = sum(self.benchmark_weights.values())
|
|
self.normalized_weights = {
|
|
k: w / total_weight for k, w in self.benchmark_weights.items()
|
|
}
|
|
|
|
def evaluate(self, params: Dict[str, Any]) -> Dict[str, Any]:
|
|
"""
|
|
Simulate running multiple benchmarks with weights.
|
|
|
|
Args:
|
|
params: System parameters to evaluate
|
|
|
|
Returns:
|
|
Dictionary with weighted results
|
|
"""
|
|
all_results = {}
|
|
combined_quality_score = 0.0
|
|
combined_speed_score = 0.0
|
|
total_duration = 0.0
|
|
|
|
# Run each benchmark with weight > 0
|
|
for benchmark_name, weight in self.normalized_weights.items():
|
|
if weight > 0 and benchmark_name in self.simulators:
|
|
simulator = self.simulators[benchmark_name]
|
|
|
|
# Run benchmark simulation
|
|
result = simulator.evaluate(params)
|
|
|
|
# Store individual results
|
|
all_results[benchmark_name] = result
|
|
|
|
# Calculate weighted contribution
|
|
quality_score = result["quality_score"]
|
|
speed_score = result["speed_score"]
|
|
|
|
weighted_quality = quality_score * weight
|
|
weighted_speed = speed_score * weight
|
|
|
|
logger.info(
|
|
f"Benchmark {benchmark_name}: quality={quality_score:.4f}, "
|
|
f"speed={speed_score:.4f}, weight={weight:.2f}"
|
|
)
|
|
|
|
# Add to combined scores
|
|
combined_quality_score += weighted_quality
|
|
combined_speed_score += weighted_speed
|
|
total_duration += result["total_duration"]
|
|
|
|
# Return combined results
|
|
return {
|
|
"quality_score": combined_quality_score,
|
|
"speed_score": combined_speed_score,
|
|
"total_duration": total_duration,
|
|
"benchmark_results": all_results,
|
|
"benchmark_weights": self.normalized_weights,
|
|
}
|
|
|
|
|
|
class OptunaOptimizerSimulator:
|
|
"""Simulates Optuna optimizer for demonstration purposes."""
|
|
|
|
def __init__(
|
|
self,
|
|
benchmark_weights: Optional[Dict[str, float]] = None,
|
|
metric_weights: Optional[Dict[str, float]] = None,
|
|
):
|
|
"""
|
|
Initialize optimizer simulator.
|
|
|
|
Args:
|
|
benchmark_weights: Weights for different benchmarks
|
|
metric_weights: Weights for quality vs speed metrics
|
|
"""
|
|
self.benchmark_weights = benchmark_weights or {"simpleqa": 1.0}
|
|
self.metric_weights = metric_weights or {"quality": 0.6, "speed": 0.4}
|
|
self.benchmark_simulator = CompositeBenchmarkSimulator(
|
|
benchmark_weights
|
|
)
|
|
|
|
def optimize(
|
|
self, param_space: Dict[str, Any], n_trials: int = 10
|
|
) -> Tuple[Dict[str, Any], float]:
|
|
"""
|
|
Simulate optimization process.
|
|
|
|
Args:
|
|
param_space: Parameter space to explore
|
|
n_trials: Number of trials
|
|
|
|
Returns:
|
|
Tuple of best parameters and best score
|
|
"""
|
|
logger.info(f"Starting optimization with {n_trials} trials")
|
|
logger.info(f"Parameter space: {param_space}")
|
|
logger.info(f"Benchmark weights: {self.benchmark_weights}")
|
|
logger.info(f"Metric weights: {self.metric_weights}")
|
|
|
|
best_score = 0.0
|
|
best_params = {}
|
|
all_trials = []
|
|
|
|
# Run simulated trials
|
|
for i in range(n_trials):
|
|
# Generate parameters for this trial
|
|
params = {}
|
|
for param_name, param_config in param_space.items():
|
|
param_type = param_config["type"]
|
|
|
|
if param_type == "int":
|
|
params[param_name] = random.randint(
|
|
param_config["low"], param_config["high"]
|
|
)
|
|
elif param_type == "categorical":
|
|
params[param_name] = random.choice(param_config["choices"])
|
|
|
|
logger.info(
|
|
f"Trial {i + 1}/{n_trials}: Testing parameters: {params}"
|
|
)
|
|
|
|
# Simulate benchmark evaluation
|
|
result = self.benchmark_simulator.evaluate(params)
|
|
|
|
# Calculate combined score based on weights
|
|
quality_score = result["quality_score"]
|
|
speed_score = result["speed_score"]
|
|
|
|
combined_score = (
|
|
self.metric_weights.get("quality", 0.6) * quality_score
|
|
+ self.metric_weights.get("speed", 0.4) * speed_score
|
|
)
|
|
|
|
logger.info(
|
|
f"Trial {i + 1}: Quality: {quality_score:.4f}, Speed: {speed_score:.4f}, "
|
|
f"Combined: {combined_score:.4f}"
|
|
)
|
|
|
|
# Save trial information
|
|
trial_info = {
|
|
"trial_number": i + 1,
|
|
"params": params,
|
|
"quality_score": quality_score,
|
|
"speed_score": speed_score,
|
|
"combined_score": combined_score,
|
|
"benchmark_results": result["benchmark_results"],
|
|
}
|
|
all_trials.append(trial_info)
|
|
|
|
# Update best parameters if this trial is better
|
|
if combined_score > best_score:
|
|
best_score = combined_score
|
|
best_params = params.copy()
|
|
logger.info(
|
|
f"New best parameters found: {best_params} with score: {best_score:.4f}"
|
|
)
|
|
|
|
# Return the best parameters
|
|
return best_params, best_score, all_trials
|
|
|
|
|
|
def optimize_parameters(
|
|
param_space: Optional[Dict[str, Any]] = None,
|
|
n_trials: int = 10,
|
|
metric_weights: Optional[Dict[str, float]] = None,
|
|
benchmark_weights: Optional[Dict[str, float]] = None,
|
|
) -> Tuple[Dict[str, Any], float]:
|
|
"""
|
|
Simulate parameter optimization.
|
|
|
|
Args:
|
|
param_space: Parameter space to explore
|
|
n_trials: Number of trials to run
|
|
metric_weights: Weights for quality vs speed
|
|
benchmark_weights: Weights for different benchmarks
|
|
|
|
Returns:
|
|
Tuple of best parameters and best score
|
|
"""
|
|
# Default parameter space
|
|
if param_space is None:
|
|
param_space = {
|
|
"iterations": {
|
|
"type": "int",
|
|
"low": 1,
|
|
"high": 5,
|
|
},
|
|
"questions_per_iteration": {
|
|
"type": "int",
|
|
"low": 1,
|
|
"high": 5,
|
|
},
|
|
"search_strategy": {
|
|
"type": "categorical",
|
|
"choices": ["iterdrag", "standard", "rapid", "parallel"],
|
|
},
|
|
}
|
|
|
|
# Create optimizer
|
|
optimizer = OptunaOptimizerSimulator(
|
|
benchmark_weights=benchmark_weights, metric_weights=metric_weights
|
|
)
|
|
|
|
# Run optimization
|
|
return optimizer.optimize(param_space, n_trials)
|
|
|
|
|
|
def print_optimization_results(params: Dict[str, Any], score: float):
|
|
"""Print optimization results in a nicely formatted way."""
|
|
print("\n" + "=" * 50)
|
|
print(" OPTIMIZATION RESULTS ")
|
|
print("=" * 50)
|
|
print(f"SCORE: {score:.4f}")
|
|
print("\nBest Parameters:")
|
|
for param, value in params.items():
|
|
print(f" {param}: {value}")
|
|
print("=" * 50 + "\n")
|
|
|
|
|
|
def main():
|
|
"""Run the multi-benchmark optimization simulation."""
|
|
# Create a timestamp-based directory for results
|
|
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
|
|
output_dir = "optimization_sim_" + timestamp
|
|
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
|
|
|
print("\n🔬 Multi-Benchmark Optimization Simulation 🔬")
|
|
print(f"Results will be saved to: {output_dir}")
|
|
|
|
# Example 1: SimpleQA only (default)
|
|
print("\n🔍 Running optimization with SimpleQA benchmark only...")
|
|
params1, score1, trials1 = optimize_parameters(
|
|
n_trials=5, benchmark_weights={"simpleqa": 1.0}
|
|
)
|
|
print_optimization_results(params1, score1)
|
|
|
|
# Example 2: BrowseComp only
|
|
print("\n🔍 Running optimization with BrowseComp benchmark only...")
|
|
params2, score2, trials2 = optimize_parameters(
|
|
n_trials=5, benchmark_weights={"browsecomp": 1.0}
|
|
)
|
|
print_optimization_results(params2, score2)
|
|
|
|
# Example 3: 60/40 weighted combination (SimpleQA/BrowseComp)
|
|
print("\n🔍 Running optimization with 60% SimpleQA and 40% BrowseComp...")
|
|
params3, score3, trials3 = optimize_parameters(
|
|
n_trials=10,
|
|
benchmark_weights={
|
|
"simpleqa": 0.6, # 60% weight for SimpleQA
|
|
"browsecomp": 0.4, # 40% weight for BrowseComp
|
|
},
|
|
)
|
|
print_optimization_results(params3, score3)
|
|
|
|
# Save results
|
|
results = {
|
|
"timestamp": timestamp,
|
|
"simpleqa_only": {
|
|
"best_params": params1,
|
|
"best_score": score1,
|
|
"trials": trials1,
|
|
},
|
|
"browsecomp_only": {
|
|
"best_params": params2,
|
|
"best_score": score2,
|
|
"trials": trials2,
|
|
},
|
|
"weighted_combination": {
|
|
"best_params": params3,
|
|
"best_score": score3,
|
|
"trials": trials3,
|
|
"weights": {"simpleqa": 0.6, "browsecomp": 0.4},
|
|
},
|
|
}
|
|
|
|
results_file = str(Path(output_dir) / "multi_benchmark_results.json")
|
|
with open(results_file, "w") as f:
|
|
# Convert all values to serializable types
|
|
json.dump(
|
|
results,
|
|
f,
|
|
indent=2,
|
|
default=lambda o: float(o) if isinstance(o, (float, int)) else o,
|
|
)
|
|
|
|
print(f"\n✅ Simulation complete! Results saved to {results_file}")
|
|
print("\nComparison of best parameters:")
|
|
print(f"- SimpleQA only: {params1}")
|
|
print(f"- BrowseComp only: {params2}")
|
|
print(f"- 60/40 weighted: {params3}")
|
|
|
|
print("\nNote: This is a simulation for demonstration purposes only.")
|
|
print(
|
|
"Real optimization would run actual benchmarks to evaluate performance."
|
|
)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|