mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
Replace logger.error() with logger.exception() in exception handlers across multiple files; convert standard logging imports to loguru in test and example files; add the "examples/" directory to the allowed patterns for environment variable access; add "sqlcipher_utils.py" to the allowed patterns for database encryption setup; add exceptions for app_factory.py and log_utils.py, which legitimately need standard logging; fix all custom code quality check and environment variable access violations. All pre-commit hooks now pass successfully on all files.
195 lines
6.2 KiB
Python
195 lines
6.2 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
Example script for running benchmarks using the Local Deep Research benchmarking framework.
|
|
|
|
This script demonstrates how to run SimpleQA and BrowseComp benchmarks programmatically.
|
|
"""
|
|
|
|
import argparse
|
|
import os
|
|
|
|
|
|
from local_deep_research.api.benchmark_functions import (
|
|
compare_configurations,
|
|
evaluate_browsecomp,
|
|
evaluate_simpleqa,
|
|
)
|
|
|
|
|
|
def main():
    """Parse command-line options and run the selected benchmark example.

    Recognised options (all optional):
        --benchmark    "simpleqa", "browsecomp" or "compare" (default "simpleqa")
        --examples     number of examples to use (default 10)
        --iterations   number of search iterations (default 3)
        --questions    questions per iteration (default 3)
        --search-tool  search tool to use (default "searxng")
        --human-eval   flag; use human evaluation
        --output-dir   directory to save results (default "benchmark_results")

    The output directory is created when missing, then the parsed namespace
    is passed to the runner registered for the chosen benchmark.
    """
    # Single source of truth for the benchmark names: the same mapping feeds
    # argparse's ``choices`` and the dispatch below, so an unknown name is
    # rejected by argparse itself and no fallback branch is needed.
    runners = {
        "simpleqa": run_simpleqa_example,
        "browsecomp": run_browsecomp_example,
        "compare": run_comparison_example,
    }

    parser = argparse.ArgumentParser(description="LDR Benchmark Examples")
    parser.add_argument(
        "--benchmark",
        choices=list(runners),
        default="simpleqa",
        help="Benchmark to run",
    )
    parser.add_argument(
        "--examples", type=int, default=10, help="Number of examples to use"
    )
    parser.add_argument(
        "--iterations", type=int, default=3, help="Number of search iterations"
    )
    parser.add_argument(
        "--questions", type=int, default=3, help="Questions per iteration"
    )
    parser.add_argument(
        "--search-tool", default="searxng", help="Search tool to use"
    )
    parser.add_argument(
        "--human-eval", action="store_true", help="Use human evaluation"
    )
    parser.add_argument(
        "--output-dir",
        default="benchmark_results",
        help="Directory to save results",
    )

    args = parser.parse_args()

    # Create output directory if it doesn't exist
    os.makedirs(args.output_dir, exist_ok=True)

    print(f"Running {args.benchmark} benchmark with {args.examples} examples")

    # Run the specified benchmark
    runners[args.benchmark](args)
|
|
|
|
|
|
def run_simpleqa_example(args):
    """Run the SimpleQA benchmark with the parsed CLI options and print a summary.

    Reads ``examples``, ``iterations``, ``questions``, ``search_tool``,
    ``human_eval`` and ``output_dir`` from ``args``, echoes them, calls
    ``evaluate_simpleqa`` and prints either the metrics found in the returned
    mapping or, when it has no "metrics" key, where the raw results were saved.
    """
    # Echo the settings in use
    print("\n=== SimpleQA Benchmark ===")
    print(f"Running with {args.examples} examples")
    print(f"Search iterations: {args.iterations}")
    print(f"Questions per iteration: {args.questions}")
    print(f"Search tool: {args.search_tool}")
    print(f"Human evaluation: {args.human_eval}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 30)

    # Run benchmark
    settings = {
        "num_examples": args.examples,
        "search_iterations": args.iterations,
        "questions_per_iteration": args.questions,
        "search_tool": args.search_tool,
        "human_evaluation": args.human_eval,
        "output_dir": args.output_dir,
    }
    outcome = evaluate_simpleqa(**settings)

    # No metrics: only report where the results were written
    if "metrics" not in outcome:
        print("\nBenchmark completed without evaluation")
        print(f" Results saved to: {outcome.get('results_path', 'N/A')}")
        return

    # Print results
    metrics = outcome["metrics"]
    print("\nResults:")
    print(f" Accuracy: {metrics.get('accuracy', 0):.3f}")
    print(f" Total examples: {outcome['total_examples']}")
    print(f" Correct answers: {metrics.get('correct', 0)}")
    avg_seconds = metrics.get("average_processing_time", 0)
    print(f" Average time: {avg_seconds:.2f}s")
    print(f"\nReport saved to: {outcome.get('report_path', 'N/A')}")
|
|
|
|
|
|
def run_browsecomp_example(args):
    """Run the BrowseComp benchmark with the parsed CLI options and print a summary.

    Reads ``examples``, ``iterations``, ``questions``, ``search_tool``,
    ``human_eval`` and ``output_dir`` from ``args``, echoes them, calls
    ``evaluate_browsecomp`` and prints either the metrics found in the returned
    mapping or, when it has no "metrics" key, where the raw results were saved.
    """
    # Echo the settings in use
    print("\n=== BrowseComp Benchmark ===")
    print(f"Running with {args.examples} examples")
    print(f"Search iterations: {args.iterations}")
    print(f"Questions per iteration: {args.questions}")
    print(f"Search tool: {args.search_tool}")
    print(f"Human evaluation: {args.human_eval}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 30)

    # Run benchmark
    settings = {
        "num_examples": args.examples,
        "search_iterations": args.iterations,
        "questions_per_iteration": args.questions,
        "search_tool": args.search_tool,
        "human_evaluation": args.human_eval,
        "output_dir": args.output_dir,
    }
    outcome = evaluate_browsecomp(**settings)

    # No metrics: only report where the results were written
    if "metrics" not in outcome:
        print("\nBenchmark completed without evaluation")
        print(f" Results saved to: {outcome.get('results_path', 'N/A')}")
        return

    # Print results
    metrics = outcome["metrics"]
    print("\nResults:")
    print(f" Accuracy: {metrics.get('accuracy', 0):.3f}")
    print(f" Total examples: {outcome['total_examples']}")
    print(f" Correct answers: {metrics.get('correct', 0)}")
    avg_seconds = metrics.get("average_processing_time", 0)
    print(f" Average time: {avg_seconds:.2f}s")
    print(f"\nReport saved to: {outcome.get('report_path', 'N/A')}")
|
|
|
|
|
|
def run_comparison_example(args):
    """Compare three fixed search configurations and print a summary table.

    Only ``args.search_tool``, ``args.examples`` and ``args.output_dir`` are
    read; the iteration and question counts are fixed per configuration below
    rather than taken from the command line. The comparison always runs on
    the SimpleQA dataset.

    The mapping returned by ``compare_configurations`` is expected to provide
    "configurations_tested", "report_path" and a "results" list whose entries
    carry "configuration_name" and (optionally) "metrics".
    """
    # Use SimpleQA for faster comparison. Kept in one variable so the banner
    # reports the dataset that is actually evaluated (args.benchmark is
    # always "compare" on this code path, which is not a dataset name).
    dataset_type = "simpleqa"

    print("\n=== Configuration Comparison ===")
    print(f"Dataset: {dataset_type}")
    print(f"Examples per configuration: {args.examples}")
    print(f"Output directory: {args.output_dir}")
    print("=" * 30)

    # Define configurations to compare
    configurations = [
        {
            "name": "Base Config",
            "search_tool": args.search_tool,
            "iterations": 1,
            "questions_per_iteration": 3,
        },
        {
            "name": "More Iterations",
            "search_tool": args.search_tool,
            "iterations": 3,
            "questions_per_iteration": 3,
        },
        {
            "name": "More Questions",
            "search_tool": args.search_tool,
            "iterations": 1,
            "questions_per_iteration": 5,
        },
    ]

    # Run comparison
    result = compare_configurations(
        dataset_type=dataset_type,
        num_examples=args.examples,
        configurations=configurations,
        output_dir=args.output_dir,
    )

    # Print results
    print("\nComparison Results:")
    print(f" Configurations tested: {result['configurations_tested']}")
    print(f" Report saved to: {result['report_path']}")

    # Print brief comparison table
    print("\nResults Summary:")
    print("Configuration | Accuracy | Avg. Time")
    print("--------------- | -------- | ---------")
    for res in result["results"]:
        name = res["configuration_name"]
        metrics = res.get("metrics", {})
        acc = metrics.get("accuracy", 0)
        # Named avg_time (not ``time``) to avoid shadowing the stdlib module name.
        avg_time = metrics.get("average_processing_time", 0)
        print(f"{name:15} | {acc:.3f} | {avg_time:.2f}s")
|
|
|
|
|
|
# Run the examples only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|