Files
local-deep-research/examples/optimization/run_parallel_benchmark.py
LearningCircuit 0c6635ecc2 feat: Add pre-commit hook to enforce pathlib usage (issue #640) (#656)
* feat: Add pre-commit hook to enforce pathlib usage (issue #640)

- Created check-pathlib-usage.py pre-commit hook using AST parsing
- Detects os.path usage and suggests pathlib alternatives
- Fixed os.path.normpath usage in auth/routes.py to use PurePosixPath
- Added hook configuration to .pre-commit-config.yaml

The hook provides helpful suggestions for replacing os.path calls with
their pathlib equivalents for better cross-platform compatibility.

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* feat: Add missing pathlib pre-commit hook script

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* refactor: Migrate core src modules from os.path to pathlib

- Fixed web/app_factory.py, config/llm_config.py, metrics/token_counter.py
- Fixed utilities/es_utils.py, web/routes/benchmark_routes.py
- Fixed web/routes/settings_routes.py, web_search_engines/engines/search_engine_local.py
- Replaced os.path.join() with Path() / syntax
- Replaced os.path.exists() with Path().exists()
- Replaced os.path.basename() with Path().name
- Replaced os.path.dirname() with Path().parent

Part of the migration to modern pathlib API for better cross-platform
compatibility and cleaner code.

Co-Authored-By: djpetti <djpetti@users.noreply.github.com>

* refactor: Migrate from os.path to pathlib in src and tests (issue #640)

Replaced os.path usage with pathlib.Path throughout:
- src/local_deep_research/benchmarks: All os.path.join, exists, dirname, basename, abspath replaced
- tests directory: Complete migration of all test files
- Improved cross-platform compatibility and code readability
- Kept os.path.expandvars in env_settings.py (no pathlib equivalent)

Part of pre-commit hook enforcement for pathlib usage.
Remaining work: examples/ and scripts/ directories.

Co-Authored-By: djpetti

* fix: Complete migration from os.path to pathlib.Path (issue #640)

Completed manual migration of all os.path usage to pathlib.Path across:
- scripts/ directory (3 files)
- examples/ directory (25 files total)
  - examples/benchmarks/ (8 files)
  - examples/optimization/ (16 files)
  - examples/show_env_vars.py
- src/local_deep_research/settings/env_settings.py

Changes made:
- Replaced os.path.join() with Path() / syntax
- Replaced os.path.exists() with Path().exists()
- Replaced os.path.dirname() with Path().parent
- Replaced os.path.basename() with Path().name or Path().stem
- Replaced os.path.abspath() with Path().resolve()
- Replaced os.makedirs() with Path().mkdir(parents=True, exist_ok=True)
- Added pathlib import where needed

Note: Kept os.path.expandvars in env_settings.py as there is no pathlib
equivalent. Added comment explaining this limitation.

This completes the pathlib migration for issue #640.

Co-Authored-By: djpetti

* fix: Allow os.path.expandvars in pathlib pre-commit hook

Updated the check-pathlib-usage.py pre-commit hook to skip checking
os.path.expandvars since it has no pathlib equivalent.

Changes:
- Added exception for expandvars in both visit_Attribute and visit_Call methods
- Added comment in equivalents dictionary noting expandvars is allowed
- This allows env_settings.py to use os.path.expandvars without failing checks

This resolves the pre-commit CI failure while maintaining the pathlib
enforcement for all other os.path methods.

Co-Authored-By: djpetti

---------

Co-authored-by: djpetti
2025-08-17 22:52:35 +02:00

299 lines
8.8 KiB
Python
Executable File

#!/usr/bin/env python
"""
Run SimpleQA and BrowseComp benchmarks in parallel with 300 examples each.
This script demonstrates running multiple benchmarks in parallel with a large number of examples.
Usage:
# Install dependencies with PDM
cd /path/to/local-deep-research
pdm install
# Run the script with PDM
pdm run python examples/optimization/run_parallel_benchmark.py
"""
import argparse
import concurrent.futures
import os
import sys
import time
from datetime import datetime, UTC
from pathlib import Path
from loguru import logger
# Put the ``src`` directory found three levels above this file at the front
# of the import path, so imports performed later resolve against it first.
_checkout_root = Path(__file__).parent.parent.parent.resolve()
project_root = str(_checkout_root)
sys.path.insert(0, str(_checkout_root / "src"))
def run_simpleqa_benchmark(
    num_examples,
    output_dir,
    model=None,
    provider=None,
    endpoint_url=None,
    api_key=None,
):
    """Evaluate the SimpleQA benchmark and hand back the evaluator's result.

    Args:
        num_examples: Number of benchmark examples to evaluate.
        output_dir: Base results directory; this run writes into its
            ``simpleqa`` subdirectory.
        model: Forwarded to the evaluator as ``search_model``.
        provider: Forwarded to the evaluator as ``search_provider``.
        endpoint_url: Forwarded to the evaluator as ``endpoint_url``.
        api_key: Accepted but not used by this function.

    Returns:
        Whatever ``evaluate_simpleqa`` returns, unchanged.
    """
    # Imported at call time rather than at module import time.
    from local_deep_research.benchmarks.benchmark_functions import (
        evaluate_simpleqa,
    )

    logger.info(f"Starting SimpleQA benchmark with {num_examples} examples")
    began_at = time.time()

    # Fixed search/evaluation settings for this benchmark run.
    evaluator_options = {
        "num_examples": num_examples,
        "search_iterations": 2,
        "questions_per_iteration": 3,
        "search_strategy": "source_based",
        "search_tool": "searxng",
        "search_model": model,
        "search_provider": provider,
        "endpoint_url": endpoint_url,
        "output_dir": str(Path(output_dir, "simpleqa")),
        "evaluation_provider": "ANTHROPIC",
        "evaluation_model": "claude-3-7-sonnet-20250219",
    }
    outcome = evaluate_simpleqa(**evaluator_options)

    elapsed = time.time() - began_at
    logger.info(f"SimpleQA benchmark completed in {elapsed:.1f} seconds")

    # Accuracy is only reported for a non-empty dict result.
    if outcome and isinstance(outcome, dict):
        accuracy = outcome.get("accuracy", "N/A")
        logger.info(f"SimpleQA accuracy: {accuracy}")

    return outcome
def run_browsecomp_benchmark(
    num_examples,
    output_dir,
    model=None,
    provider=None,
    endpoint_url=None,
    api_key=None,
):
    """Evaluate the BrowseComp benchmark and hand back the evaluator's result.

    Args:
        num_examples: Number of benchmark examples to evaluate.
        output_dir: Base results directory; this run writes into its
            ``browsecomp`` subdirectory.
        model: Forwarded to the evaluator as ``search_model``.
        provider: Forwarded to the evaluator as ``search_provider``.
        endpoint_url: Forwarded to the evaluator as ``endpoint_url``.
        api_key: Accepted but not used by this function.

    Returns:
        Whatever ``evaluate_browsecomp`` returns, unchanged.
    """
    # Imported at call time rather than at module import time.
    from local_deep_research.benchmarks.benchmark_functions import (
        evaluate_browsecomp,
    )

    logger.info(f"Starting BrowseComp benchmark with {num_examples} examples")
    began_at = time.time()

    # Fixed search/evaluation settings for this benchmark run.
    evaluator_options = {
        "num_examples": num_examples,
        "search_iterations": 3,
        "questions_per_iteration": 3,
        "search_strategy": "source_based",
        "search_tool": "searxng",
        "search_model": model,
        "search_provider": provider,
        "endpoint_url": endpoint_url,
        "output_dir": str(Path(output_dir, "browsecomp")),
        "evaluation_provider": "ANTHROPIC",
        "evaluation_model": "claude-3-7-sonnet-20250219",
    }
    outcome = evaluate_browsecomp(**evaluator_options)

    elapsed = time.time() - began_at
    logger.info(f"BrowseComp benchmark completed in {elapsed:.1f} seconds")

    # Accuracy is only reported for a non-empty dict result.
    if outcome and isinstance(outcome, dict):
        accuracy = outcome.get("accuracy", "N/A")
        logger.info(f"BrowseComp accuracy: {accuracy}")

    return outcome
def setup_llm_environment(
    model=None, provider=None, endpoint_url=None, api_key=None
):
    """Export LLM settings as environment variables of the current process.

    Only arguments with a truthy value are exported; the others leave the
    environment untouched.

    Args:
        model: Stored in ``LDR_LLM__MODEL``.
        provider: Stored in ``LDR_LLM__PROVIDER``. Also selects which
            variables receive ``api_key``.
        endpoint_url: Stored in ``OPENAI_ENDPOINT_URL`` and
            ``LDR_LLM__OPENAI_ENDPOINT_URL``.
        api_key: Stored in the provider-specific variables when ``provider``
            is ``"openai"``, ``"openai_endpoint"`` or ``"anthropic"``
            (exact match). For any other provider, including ``None``, no
            variable is set and a warning is logged.
    """
    # Provider name -> environment variables that carry its API key.
    api_key_variables = {
        "openai": ("OPENAI_API_KEY", "LDR_LLM__OPENAI_API_KEY"),
        "openai_endpoint": (
            "OPENAI_ENDPOINT_API_KEY",
            "LDR_LLM__OPENAI_ENDPOINT_API_KEY",
        ),
        "anthropic": ("ANTHROPIC_API_KEY", "LDR_LLM__ANTHROPIC_API_KEY"),
    }

    if model:
        os.environ["LDR_LLM__MODEL"] = model
        logger.info(f"Using LLM model: {model}")

    if provider:
        os.environ["LDR_LLM__PROVIDER"] = provider
        logger.info(f"Using LLM provider: {provider}")

    if endpoint_url:
        os.environ["OPENAI_ENDPOINT_URL"] = endpoint_url
        os.environ["LDR_LLM__OPENAI_ENDPOINT_URL"] = endpoint_url
        logger.info(f"Using endpoint URL: {endpoint_url}")

    if api_key:
        target_variables = api_key_variables.get(provider)
        if target_variables is None:
            # Nothing can be exported without a known provider; say so
            # instead of reporting a successful configuration. The key
            # itself is never logged.
            logger.warning(
                f"API key provided but provider {provider!r} is not one of "
                f"{sorted(api_key_variables)}; no API key variable was set"
            )
        else:
            for variable_name in target_variables:
                os.environ[variable_name] = api_key
            logger.info("API key configured")
def _build_argument_parser():
    """Build the command-line parser for the parallel benchmark run."""
    parser = argparse.ArgumentParser(
        description="Run SimpleQA and BrowseComp benchmarks in parallel"
    )
    parser.add_argument(
        "--examples",
        type=int,
        default=300,
        help="Number of examples for each benchmark (default: 300)",
    )
    # LLM configuration options
    parser.add_argument(
        "--model",
        help="Model name for the LLM (e.g., 'claude-3-sonnet-20240229')",
    )
    parser.add_argument(
        "--provider",
        help="Provider for the LLM (e.g., 'anthropic', 'openai', 'openai_endpoint')",
    )
    parser.add_argument(
        "--endpoint-url",
        help="Custom endpoint URL (e.g., 'https://openrouter.ai/api/v1')",
    )
    parser.add_argument("--api-key", help="API key for the LLM provider")
    return parser


def _accuracy_or_none(results):
    """Return the ``accuracy`` entry of a non-empty result dict, else ``None``."""
    if results and isinstance(results, dict):
        return results.get("accuracy")
    return None


def main():
    """Run both benchmarks concurrently, print a summary and save it as JSON.

    Command-line options select the number of examples per benchmark and the
    optional LLM model/provider/endpoint/API key. Results are written below
    ``<project_root>/benchmark_results/parallel_benchmark_<UTC timestamp>``.

    A benchmark that raises is logged and recorded as not completed; it does
    not abort the other benchmark or the summary. A failure while writing the
    summary file is logged and otherwise ignored.

    Returns:
        int: Always ``0``, including when a benchmark or the summary failed.
    """
    # json is not among this module's top-level imports, so bring it in here.
    import json

    args = _build_argument_parser().parse_args()

    # Create timestamp for unique output directory
    timestamp = datetime.now(UTC).strftime("%Y%m%d_%H%M%S")
    output_dir = str(
        Path(project_root)
        / "benchmark_results"
        / f"parallel_benchmark_{timestamp}"
    )
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # Display start information
    print(f"Starting parallel benchmarks with {args.examples} examples each")
    print(f"Results will be saved to: {output_dir}")

    # Set up LLM environment if specified
    setup_llm_environment(
        model=args.model,
        provider=args.provider,
        endpoint_url=args.endpoint_url,
        api_key=args.api_key,
    )

    # Start time for total execution
    total_start_time = time.time()

    # Both workers receive the same positional arguments.
    worker_args = (
        args.examples,
        output_dir,
        args.model,
        args.provider,
        args.endpoint_url,
        args.api_key,
    )

    # Run benchmarks in parallel using ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
        simpleqa_future = executor.submit(run_simpleqa_benchmark, *worker_args)
        browsecomp_future = executor.submit(
            run_browsecomp_benchmark, *worker_args
        )

        # Collect each result separately so one failure does not hide the
        # other benchmark's outcome.
        try:
            simpleqa_results = simpleqa_future.result()
            print("SimpleQA benchmark completed successfully")
        except Exception:
            logger.exception("Error in SimpleQA benchmark")
            simpleqa_results = None

        try:
            browsecomp_results = browsecomp_future.result()
            print("BrowseComp benchmark completed successfully")
        except Exception:
            logger.exception("Error in BrowseComp benchmark")
            browsecomp_results = None

    # Calculate total time
    total_duration = time.time() - total_start_time

    # Print summary
    print("\n" + "=" * 50)
    print(" PARALLEL BENCHMARK SUMMARY ")
    print("=" * 50)
    print(f"Total duration: {total_duration:.1f} seconds")
    print(f"Examples per benchmark: {args.examples}")

    if simpleqa_results and isinstance(simpleqa_results, dict):
        print(f"SimpleQA accuracy: {simpleqa_results.get('accuracy', 'N/A')}")
    else:
        print("SimpleQA: Failed or no results")

    if browsecomp_results and isinstance(browsecomp_results, dict):
        print(
            f"BrowseComp accuracy: {browsecomp_results.get('accuracy', 'N/A')}"
        )
    else:
        print("BrowseComp: Failed or no results")

    print(f"Results saved to: {output_dir}")
    print("=" * 50)

    # Save summary to JSON file (best effort: failures are only logged)
    try:
        summary = {
            "timestamp": timestamp,
            "examples_per_benchmark": args.examples,
            "total_duration": total_duration,
            "simpleqa": {
                # Same dict guard as the printed summary, so a non-dict
                # result cannot prevent the file from being written.
                "accuracy": _accuracy_or_none(simpleqa_results),
                "completed": simpleqa_results is not None,
            },
            "browsecomp": {
                "accuracy": _accuracy_or_none(browsecomp_results),
                "completed": browsecomp_results is not None,
            },
            "model": args.model,
            "provider": args.provider,
        }
        summary_path = Path(output_dir) / "parallel_benchmark_summary.json"
        # Explicit encoding keeps the file independent of the locale default.
        with summary_path.open("w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2)
    except Exception:
        logger.exception("Error saving summary")

    return 0


if __name__ == "__main__":
    sys.exit(main())