Files
local-deep-research/examples/benchmarks/scripts/run_focused_benchmark_fixed.py
LearningCircuit 653707a556 fix(encoding): add encoding="utf-8" to bare open() / read_text / write_text in examples and scripts (#4118)
Cleanup follow-up to #3797. The check-open-encoding hook was originally scoped
with exclude: ^(tests/|examples/|scripts/) because those directories had ~45
pre-existing bare open() calls and addressing them was out of scope for the
core Windows bug fix.

This commit:
  * adds encoding="utf-8" to 45 read/write call sites under examples/ and
    scripts/ — JSON benchmark results, config-doc generators, workflow
    status pages, and the datetime-timezone pre-commit hook
  * narrows the hook exclude to ^tests/ only, so future regressions in
    examples/scripts/ are blocked at commit time

Windows users running the benchmark scripts and config-doc generator would
previously hit silent failures or UnicodeDecodeErrors on non-ASCII content
under cp1252. The package itself was already protected by #3797.
2026-05-18 21:45:04 +02:00

336 lines
11 KiB
Python
Executable File

#!/usr/bin/env python
"""
Focused source-based strategy evaluation with complete metrics.
This script runs a focused evaluation of the source-based strategy with
comprehensive metrics for both SimpleQA and BrowseComp benchmarks.
Updated version that patches the graders module to use the local get_llm
function for grading, reads API keys from environment variables, and grades
with an Anthropic Claude Sonnet model (via OpenRouter when no Anthropic key
is set).
"""
import os
import sys
import time
from datetime import datetime, UTC
from pathlib import Path
# Put the "src" directory located next to this script at the front of the
# import path, unless it is already listed there.
# NOTE(review): presumably meant to locate the project sources — confirm that
# a "src" directory actually exists beside this script.
src_dir = str(Path(__file__).parent.joinpath("src").resolve())
if sys.path.count(src_dir) == 0:
    sys.path.insert(0, src_dir)

# Configuration is taken from environment variables:
# - ANTHROPIC_API_KEY for Anthropic API access
# - OPENROUTER_API_KEY for OpenRouter API access (if used)
# - LDR_DATA_DIR for data directory location (if needed); falls back to
#   "<src_dir>/data" when unset
data_dir = os.getenv("LDR_DATA_DIR", str(Path(src_dir, "data")))
def setup_grading_config():
    """
    Build the LLM configuration used for grading benchmark answers.

    API keys are looked up in environment variables only. The default
    configuration targets Anthropic directly. When ``ANTHROPIC_API_KEY`` is
    not set but ``OPENROUTER_API_KEY`` is, an OpenRouter configuration
    (OpenAI-compatible endpoint) is returned instead. When neither key is
    set, the Anthropic configuration is still returned and a warning is
    printed.

    Returns:
        Dict containing the evaluation configuration
    """
    # Only use parameters that get_llm() accepts.
    # NOTE(review): the model ID below names Claude 3 Sonnet, whereas the
    # OpenRouter fallback names Claude 3.7 Sonnet — confirm the two are meant
    # to differ.
    evaluation_config = {
        "model_name": "claude-3-sonnet-20240229",  # Anthropic model ID
        "provider": "anthropic",  # Use Anthropic directly
        "temperature": 0,  # Zero temp for consistent evaluation
    }

    if os.environ.get("ANTHROPIC_API_KEY"):
        # Report the model that is actually configured so the log can never
        # drift from the configuration.
        print(
            "Found Anthropic API key in environment, will use "
            f"{evaluation_config['model_name']} for grading"
        )
        return evaluation_config

    print("Warning: No Anthropic API key found in environment")
    print("Checking for alternative providers...")

    # Try OpenRouter as a fallback
    if os.environ.get("OPENROUTER_API_KEY"):
        evaluation_config = {
            "model_name": "anthropic/claude-3-7-sonnet",  # OpenRouter format
            "provider": "openai_endpoint",
            "openai_endpoint_url": "https://openrouter.ai/api/v1",
            "temperature": 0,
        }
        print(
            "Found OpenRouter API key, will use OpenRouter with "
            f"{evaluation_config['model_name']}"
        )
    else:
        # Keep the best-effort behaviour (return the Anthropic config), but
        # make the situation visible instead of failing silently later.
        print(
            "Warning: No OpenRouter API key found either; returning the "
            "Anthropic configuration (grading may fail unless get_llm() can "
            "obtain a key elsewhere)"
        )

    return evaluation_config
def run_direct_evaluation(strategy="source_based", iterations=1, examples=5):
"""
Run direct evaluation of a specific strategy configuration.
Args:
strategy: Search strategy to evaluate (default: source_based)
iterations: Number of iterations for the strategy (default: 1)
examples: Number of examples to evaluate (default: 5)
"""
# Import the benchmark components
try:
from local_deep_research.benchmarks.evaluators.browsecomp import (
BrowseCompEvaluator,
)
from local_deep_research.benchmarks.evaluators.composite import (
CompositeBenchmarkEvaluator,
)
from local_deep_research.benchmarks.evaluators.simpleqa import (
SimpleQAEvaluator,
)
from local_deep_research.config.llm_config import get_llm
except ImportError as e:
print(f"Error importing benchmark components: {e}")
print("Current sys.path:", sys.path)
return
# Set up custom grading configuration
evaluation_config = setup_grading_config()
if not evaluation_config:
print(
"Failed to setup evaluation configuration, proceeding with default config"
)
# Patch the graders module to use our local get_llm
try:
# This ensures we use the local get_llm function that accesses the database
import local_deep_research.benchmarks.graders as graders
# Store the original function for reference
original_get_evaluation_llm = graders.get_evaluation_llm
# Define a new function that uses our local get_llm directly
def custom_get_evaluation_llm(custom_config=None):
"""
Override that uses the local get_llm with database access.
"""
if custom_config is None:
custom_config = evaluation_config
print(f"Getting evaluation LLM with config: {custom_config}")
return get_llm(**custom_config)
# Replace the function with our custom version
graders.get_evaluation_llm = custom_get_evaluation_llm
print(
"Successfully patched graders.get_evaluation_llm to use local get_llm function"
)
except Exception as e:
print(f"Error patching graders module: {e}")
import traceback
traceback.print_exc()
# Create timestamp for output
timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
output_dir = str(Path("benchmark_results") / f"direct_eval_{timestamp}")
Path(output_dir).mkdir(parents=True, exist_ok=True)
config = {
"search_strategy": strategy,
"iterations": iterations,
# Add other fixed parameters to ensure a complete run
"questions_per_iteration": 1,
"max_results": 10,
"search_tool": "searxng", # Specify SearXNG search engine
"timeout": 10, # Very short timeout to speed up the demo
}
# Run SimpleQA benchmark
print(
f"\n=== Running SimpleQA benchmark with {strategy} strategy, {iterations} iterations ==="
)
simpleqa_start = time.time()
try:
# Create SimpleQA evaluator (without the evaluation_config parameter)
simpleqa = SimpleQAEvaluator()
# The evaluation_config will be used automatically through our patched function
# when grade_results is called inside the evaluator
simpleqa_results = simpleqa.evaluate(
config,
num_examples=examples,
output_dir=str(Path(output_dir) / "simpleqa"),
)
simpleqa_duration = time.time() - simpleqa_start
print(
f"SimpleQA evaluation complete in {simpleqa_duration:.1f} seconds"
)
print(f"SimpleQA accuracy: {simpleqa_results.get('accuracy', 0):.4f}")
print(f"SimpleQA metrics: {simpleqa_results.get('metrics', {})}")
# Save results
import json
with open(
Path(output_dir) / "simpleqa_results.json", "w", encoding="utf-8"
) as f:
json.dump(simpleqa_results, f, indent=2)
except Exception as e:
print(f"Error during SimpleQA evaluation: {e}")
import traceback
traceback.print_exc()
# Run BrowseComp benchmark
print(
f"\n=== Running BrowseComp benchmark with {strategy} strategy, {iterations} iterations ==="
)
browsecomp_start = time.time()
try:
# Create BrowseComp evaluator (without the evaluation_config parameter)
browsecomp = BrowseCompEvaluator()
# The evaluation_config will be used automatically through our patched function
# when grade_results is called inside the evaluator
browsecomp_results = browsecomp.evaluate(
config,
num_examples=examples,
output_dir=str(Path(output_dir) / "browsecomp"),
)
browsecomp_duration = time.time() - browsecomp_start
print(
f"BrowseComp evaluation complete in {browsecomp_duration:.1f} seconds"
)
print(f"BrowseComp score: {browsecomp_results.get('score', 0):.4f}")
print(f"BrowseComp metrics: {browsecomp_results.get('metrics', {})}")
# Save results
with open(
Path(output_dir) / "browsecomp_results.json", "w", encoding="utf-8"
) as f:
json.dump(browsecomp_results, f, indent=2)
except Exception as e:
print(f"Error during BrowseComp evaluation: {e}")
import traceback
traceback.print_exc()
# Run composite benchmark
print(
f"\n=== Running Composite benchmark with {strategy} strategy, {iterations} iterations ==="
)
composite_start = time.time()
try:
# Create composite evaluator with benchmark weights (without evaluation_config parameter)
benchmark_weights = {"simpleqa": 0.5, "browsecomp": 0.5}
composite = CompositeBenchmarkEvaluator(
benchmark_weights=benchmark_weights
)
composite_results = composite.evaluate(
config,
num_examples=examples,
output_dir=str(Path(output_dir) / "composite"),
)
composite_duration = time.time() - composite_start
print(
f"Composite evaluation complete in {composite_duration:.1f} seconds"
)
print(f"Composite score: {composite_results.get('score', 0):.4f}")
# Save results
with open(
Path(output_dir) / "composite_results.json", "w", encoding="utf-8"
) as f:
json.dump(composite_results, f, indent=2)
except Exception as e:
print(f"Error during composite evaluation: {e}")
import traceback
traceback.print_exc()
# Generate summary
print("\n=== Evaluation Summary ===")
print(f"Strategy: {strategy}")
print(f"Iterations: {iterations}")
print(f"Examples: {examples}")
print(f"Results saved to: {output_dir}")
# If we patched the graders module, restore the original function
if "original_get_evaluation_llm" in locals():
graders.get_evaluation_llm = original_get_evaluation_llm
print("Restored original graders.get_evaluation_llm function")
return {
"simpleqa": simpleqa_results
if "simpleqa_results" in locals()
else None,
"browsecomp": browsecomp_results
if "browsecomp_results" in locals()
else None,
"composite": composite_results
if "composite_results" in locals()
else None,
}
def main():
    """
    Parse command-line options, run the evaluation, and compute an exit code.

    Options: ``--strategy`` (default ``source_based``), ``--iterations``
    (default 1) and ``--examples`` (default 5).

    Returns:
        0 if at least one benchmark produced a truthy result, otherwise 1.
        1 is also returned when ``run_direct_evaluation`` yields no results
        at all.
    """
    # Parse command line arguments
    import argparse

    parser = argparse.ArgumentParser(
        description="Run focused strategy benchmark"
    )
    parser.add_argument(
        "--strategy",
        type=str,
        default="source_based",
        help="Strategy to evaluate (default: source_based)",
    )
    parser.add_argument(
        "--iterations",
        type=int,
        default=1,
        help="Number of iterations (default: 1)",
    )
    parser.add_argument(
        "--examples",
        type=int,
        default=5,
        help="Number of examples to evaluate (default: 5)",
    )
    args = parser.parse_args()

    print(
        f"Starting focused evaluation of {args.strategy} strategy with {args.iterations} iterations"
    )
    print(f"Evaluating with {args.examples} examples")

    # Run the evaluation
    results = run_direct_evaluation(
        strategy=args.strategy,
        iterations=args.iterations,
        examples=args.examples,
    )

    # run_direct_evaluation returns None when the benchmark components
    # cannot be imported; treat that as failure instead of crashing on
    # ``None.values()``.
    if not results:
        return 1

    # Return success if at least one benchmark completed
    return 0 if any(results.values()) else 1


if __name__ == "__main__":
    sys.exit(main())