#!/usr/bin/env python """ Example script for running benchmarks using the Local Deep Research benchmarking framework. This script demonstrates how to run SimpleQA and BrowseComp benchmarks programmatically. """ import argparse import os from local_deep_research.api.benchmark_functions import ( compare_configurations, evaluate_browsecomp, evaluate_simpleqa, ) def main(): """Run benchmark examples.""" parser = argparse.ArgumentParser(description="LDR Benchmark Examples") parser.add_argument( "--benchmark", choices=["simpleqa", "browsecomp", "compare"], default="simpleqa", help="Benchmark to run", ) parser.add_argument( "--examples", type=int, default=10, help="Number of examples to use" ) parser.add_argument( "--iterations", type=int, default=3, help="Number of search iterations" ) parser.add_argument( "--questions", type=int, default=3, help="Questions per iteration" ) parser.add_argument( "--search-tool", default="searxng", help="Search tool to use" ) parser.add_argument( "--human-eval", action="store_true", help="Use human evaluation" ) parser.add_argument( "--output-dir", default="benchmark_results", help="Directory to save results", ) args = parser.parse_args() # Create output directory if it doesn't exist os.makedirs(args.output_dir, exist_ok=True) print(f"Running {args.benchmark} benchmark with {args.examples} examples") # Run the specified benchmark if args.benchmark == "simpleqa": run_simpleqa_example(args) elif args.benchmark == "browsecomp": run_browsecomp_example(args) elif args.benchmark == "compare": run_comparison_example(args) else: print(f"Unknown benchmark: {args.benchmark}") def run_simpleqa_example(args): """Run SimpleQA benchmark.""" print("\n=== SimpleQA Benchmark ===") print(f"Running with {args.examples} examples") print(f"Search iterations: {args.iterations}") print(f"Questions per iteration: {args.questions}") print(f"Search tool: {args.search_tool}") print(f"Human evaluation: {args.human_eval}") print(f"Output directory: {args.output_dir}") print("=" * 30) # Run benchmark result = evaluate_simpleqa( num_examples=args.examples, search_iterations=args.iterations, questions_per_iteration=args.questions, search_tool=args.search_tool, human_evaluation=args.human_eval, output_dir=args.output_dir, ) # Print results if "metrics" in result: print("\nResults:") print(f" Accuracy: {result['metrics'].get('accuracy', 0):.3f}") print(f" Total examples: {result['total_examples']}") print(f" Correct answers: {result['metrics'].get('correct', 0)}") print( f" Average time: {result['metrics'].get('average_processing_time', 0):.2f}s" ) print(f"\nReport saved to: {result.get('report_path', 'N/A')}") else: print("\nBenchmark completed without evaluation") print(f" Results saved to: {result.get('results_path', 'N/A')}") def run_browsecomp_example(args): """Run BrowseComp benchmark.""" print("\n=== BrowseComp Benchmark ===") print(f"Running with {args.examples} examples") print(f"Search iterations: {args.iterations}") print(f"Questions per iteration: {args.questions}") print(f"Search tool: {args.search_tool}") print(f"Human evaluation: {args.human_eval}") print(f"Output directory: {args.output_dir}") print("=" * 30) # Run benchmark result = evaluate_browsecomp( num_examples=args.examples, search_iterations=args.iterations, questions_per_iteration=args.questions, search_tool=args.search_tool, human_evaluation=args.human_eval, output_dir=args.output_dir, ) # Print results if "metrics" in result: print("\nResults:") print(f" Accuracy: {result['metrics'].get('accuracy', 0):.3f}") print(f" Total examples: {result['total_examples']}") print(f" Correct answers: {result['metrics'].get('correct', 0)}") print( f" Average time: {result['metrics'].get('average_processing_time', 0):.2f}s" ) print(f"\nReport saved to: {result.get('report_path', 'N/A')}") else: print("\nBenchmark completed without evaluation") print(f" Results saved to: {result.get('results_path', 'N/A')}") def run_comparison_example(args): """Run configuration comparison.""" print("\n=== Configuration Comparison ===") print(f"Dataset: {args.benchmark}") print(f"Examples per configuration: {args.examples}") print(f"Output directory: {args.output_dir}") print("=" * 30) # Define configurations to compare configurations = [ { "name": "Base Config", "search_tool": args.search_tool, "iterations": 1, "questions_per_iteration": 3, }, { "name": "More Iterations", "search_tool": args.search_tool, "iterations": 3, "questions_per_iteration": 3, }, { "name": "More Questions", "search_tool": args.search_tool, "iterations": 1, "questions_per_iteration": 5, }, ] # Run comparison result = compare_configurations( dataset_type="simpleqa", # Use SimpleQA for faster comparison num_examples=args.examples, configurations=configurations, output_dir=args.output_dir, ) # Print results print("\nComparison Results:") print(f" Configurations tested: {result['configurations_tested']}") print(f" Report saved to: {result['report_path']}") # Print brief comparison table print("\nResults Summary:") print("Configuration | Accuracy | Avg. Time") print("--------------- | -------- | ---------") for res in result["results"]: name = res["configuration_name"] acc = res.get("metrics", {}).get("accuracy", 0) time = res.get("metrics", {}).get("average_processing_time", 0) print(f"{name:15} | {acc:.3f} | {time:.2f}s") if __name__ == "__main__": main()