Files
local-deep-research/examples/run_benchmark.py
LearningCircuit af7e279db5 fix: Major pre-commit hook compliance improvements
- Replace logger.error() with logger.exception() in exception handlers across multiple files
- Convert standard logging imports to loguru in test and example files
- Add "examples/" directory to allowed patterns for environment variable access
- Add "sqlcipher_utils.py" to allowed patterns for database encryption setup
- Add exceptions for app_factory.py and log_utils.py which legitimately need standard logging
- Fix all custom code quality checks and environment variable access violations

All pre-commit hooks now pass successfully on all files.
2025-07-16 19:08:41 +02:00

195 lines
6.2 KiB
Python

#!/usr/bin/env python
"""
Example script for running benchmarks using the Local Deep Research benchmarking framework.
This script demonstrates how to run SimpleQA and BrowseComp benchmarks programmatically.
"""
import argparse
import os
from local_deep_research.api.benchmark_functions import (
compare_configurations,
evaluate_browsecomp,
evaluate_simpleqa,
)
def main(argv=None):
    """Parse command-line options and run the selected benchmark example.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads the process command line, so a
            plain ``main()`` call behaves exactly as before.
    """
    parser = argparse.ArgumentParser(description="LDR Benchmark Examples")
    parser.add_argument(
        "--benchmark",
        choices=["simpleqa", "browsecomp", "compare"],
        default="simpleqa",
        help="Benchmark to run",
    )
    parser.add_argument(
        "--examples", type=int, default=10, help="Number of examples to use"
    )
    parser.add_argument(
        "--iterations", type=int, default=3, help="Number of search iterations"
    )
    parser.add_argument(
        "--questions", type=int, default=3, help="Questions per iteration"
    )
    parser.add_argument(
        "--search-tool", default="searxng", help="Search tool to use"
    )
    parser.add_argument(
        "--human-eval", action="store_true", help="Use human evaluation"
    )
    parser.add_argument(
        "--output-dir",
        default="benchmark_results",
        help="Directory to save results",
    )
    # Invalid values (unknown benchmark, non-integer counts) make argparse
    # print a usage error and exit here, before anything is created or run.
    args = parser.parse_args(argv)

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    print(f"Running {args.benchmark} benchmark with {args.examples} examples")

    # ``choices`` above guarantees args.benchmark is one of these keys, so no
    # "unknown benchmark" fallback is needed.
    runners = {
        "simpleqa": run_simpleqa_example,
        "browsecomp": run_browsecomp_example,
        "compare": run_comparison_example,
    }
    runners[args.benchmark](args)
def run_simpleqa_example(args):
    """Run the SimpleQA benchmark with the parsed CLI options and print a summary.

    Args:
        args: Parsed argument namespace; reads ``examples``, ``iterations``,
            ``questions``, ``search_tool``, ``human_eval`` and ``output_dir``.
    """
    # Echo the effective settings before starting the run.
    banner = (
        "\n=== SimpleQA Benchmark ===",
        f"Running with {args.examples} examples",
        f"Search iterations: {args.iterations}",
        f"Questions per iteration: {args.questions}",
        f"Search tool: {args.search_tool}",
        f"Human evaluation: {args.human_eval}",
        f"Output directory: {args.output_dir}",
        "=" * 30,
    )
    for line in banner:
        print(line)

    outcome = evaluate_simpleqa(
        num_examples=args.examples,
        search_iterations=args.iterations,
        questions_per_iteration=args.questions,
        search_tool=args.search_tool,
        human_evaluation=args.human_eval,
        output_dir=args.output_dir,
    )

    # No "metrics" key: only the location of the raw results is reported.
    if "metrics" not in outcome:
        print("\nBenchmark completed without evaluation")
        print(f" Results saved to: {outcome.get('results_path', 'N/A')}")
        return

    metrics = outcome["metrics"]
    print("\nResults:")
    print(f" Accuracy: {metrics.get('accuracy', 0):.3f}")
    print(f" Total examples: {outcome['total_examples']}")
    print(f" Correct answers: {metrics.get('correct', 0)}")
    print(f" Average time: {metrics.get('average_processing_time', 0):.2f}s")
    print(f"\nReport saved to: {outcome.get('report_path', 'N/A')}")
def run_browsecomp_example(args):
    """Run the BrowseComp benchmark with the parsed CLI options and print a summary.

    Args:
        args: Parsed argument namespace; reads ``examples``, ``iterations``,
            ``questions``, ``search_tool``, ``human_eval`` and ``output_dir``.
    """
    # Echo the effective settings before starting the run.
    banner = (
        "\n=== BrowseComp Benchmark ===",
        f"Running with {args.examples} examples",
        f"Search iterations: {args.iterations}",
        f"Questions per iteration: {args.questions}",
        f"Search tool: {args.search_tool}",
        f"Human evaluation: {args.human_eval}",
        f"Output directory: {args.output_dir}",
        "=" * 30,
    )
    for line in banner:
        print(line)

    outcome = evaluate_browsecomp(
        num_examples=args.examples,
        search_iterations=args.iterations,
        questions_per_iteration=args.questions,
        search_tool=args.search_tool,
        human_evaluation=args.human_eval,
        output_dir=args.output_dir,
    )

    # No "metrics" key: only the location of the raw results is reported.
    if "metrics" not in outcome:
        print("\nBenchmark completed without evaluation")
        print(f" Results saved to: {outcome.get('results_path', 'N/A')}")
        return

    metrics = outcome["metrics"]
    print("\nResults:")
    print(f" Accuracy: {metrics.get('accuracy', 0):.3f}")
    print(f" Total examples: {outcome['total_examples']}")
    print(f" Correct answers: {metrics.get('correct', 0)}")
    print(f" Average time: {metrics.get('average_processing_time', 0):.2f}s")
    print(f"\nReport saved to: {outcome.get('report_path', 'N/A')}")
def run_comparison_example(args):
    """Compare three fixed search configurations and print a summary table.

    The comparison always runs on the SimpleQA dataset; only the number of
    examples, the search tool and the output directory come from ``args``.

    Args:
        args: Parsed argument namespace; reads ``examples``, ``search_tool``
            and ``output_dir``.
    """
    # Use SimpleQA for faster comparison. Report this value rather than
    # args.benchmark, which is "compare" when this is reached via main() and
    # would mislabel the dataset in the banner.
    dataset_type = "simpleqa"

    print("\n=== Configuration Comparison ===")
    print(f"Dataset: {dataset_type}")
    print(f"Examples per configuration: {args.examples}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 30)

    # Define configurations to compare: (name, iterations, questions per iteration)
    variants = (
        ("Base Config", 1, 3),
        ("More Iterations", 3, 3),
        ("More Questions", 1, 5),
    )
    configurations = [
        {
            "name": name,
            "search_tool": args.search_tool,
            "iterations": iterations,
            "questions_per_iteration": questions,
        }
        for name, iterations, questions in variants
    ]

    # Run comparison
    result = compare_configurations(
        dataset_type=dataset_type,
        num_examples=args.examples,
        configurations=configurations,
        output_dir=args.output_dir,
    )

    # Print results
    print("\nComparison Results:")
    print(f" Configurations tested: {result['configurations_tested']}")
    print(f" Report saved to: {result['report_path']}")

    # Print brief comparison table
    print("\nResults Summary:")
    print("Configuration | Accuracy | Avg. Time")
    print("--------------- | -------- | ---------")
    for entry in result["results"]:
        name = entry["configuration_name"]
        # A missing "metrics" entry falls back to zeros.
        metrics = entry.get("metrics", {})
        accuracy = metrics.get("accuracy", 0)
        avg_time = metrics.get("average_processing_time", 0)
        print(f"{name:15} | {accuracy:.3f} | {avg_time:.2f}s")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()