mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-16 20:10:39 +03:00
* feat: Add pre-commit hook to enforce pathlib usage (issue #640) - Created check-pathlib-usage.py pre-commit hook using AST parsing - Detects os.path usage and suggests pathlib alternatives - Fixed os.path.normpath usage in auth/routes.py to use PurePosixPath - Added hook configuration to .pre-commit-config.yaml The hook provides helpful suggestions for replacing os.path calls with their pathlib equivalents for better cross-platform compatibility. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * feat: Add missing pathlib pre-commit hook script Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate core src modules from os.path to pathlib - Fixed web/app_factory.py, config/llm_config.py, metrics/token_counter.py - Fixed utilities/es_utils.py, web/routes/benchmark_routes.py - Fixed web/routes/settings_routes.py, web_search_engines/engines/search_engine_local.py - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.basename() with Path().name - Replaced os.path.dirname() with Path().parent Part of the migration to modern pathlib API for better cross-platform compatibility and cleaner code. Co-Authored-By: djpetti <djpetti@users.noreply.github.com> * refactor: Migrate from os.path to pathlib in src and tests (issue #640) Replaced os.path usage with pathlib.Path throughout: - src/local_deep_research/benchmarks: All os.path.join, exists, dirname, basename, abspath replaced - tests directory: Complete migration of all test files - Improved cross-platform compatibility and code readability - Kept os.path.expandvars in env_settings.py (no pathlib equivalent) Part of pre-commit hook enforcement for pathlib usage. Remaining work: examples/ and scripts/ directories. Co-Authored-By: djpetti * fix: Complete migration from os.path to pathlib.Path (issue #640) Completed manual migration of all os.path usage to pathlib.Path across: - scripts/ directory (3 files) - examples/ directory (25 files total) - examples/benchmarks/ (8 files) - examples/optimization/ (16 files) - examples/show_env_vars.py - src/local_deep_research/settings/env_settings.py Changes made: - Replaced os.path.join() with Path() / syntax - Replaced os.path.exists() with Path().exists() - Replaced os.path.dirname() with Path().parent - Replaced os.path.basename() with Path().name or Path().stem - Replaced os.path.abspath() with Path().resolve() - Replaced os.makedirs() with Path().mkdir(parents=True, exist_ok=True) - Added pathlib import where needed Note: Kept os.path.expandvars in env_settings.py as there is no pathlib equivalent. Added comment explaining this limitation. This completes the pathlib migration for issue #640. Co-Authored-By: djpetti * fix: Allow os.path.expandvars in pathlib pre-commit hook Updated the check-pathlib-usage.py pre-commit hook to skip checking os.path.expandvars since it has no pathlib equivalent. Changes: - Added exception for expandvars in both visit_Attribute and visit_Call methods - Added comment in equivalents dictionary noting expandvars is allowed - This allows env_settings.py to use os.path.expandvars without failing checks This resolves the pre-commit CI failure while maintaining the pathlib enforcement for all other os.path methods. Co-Authored-By: djpetti --------- Co-authored-by: djpetti
309 lines
10 KiB
Python
Executable File
309 lines
10 KiB
Python
Executable File
#!/usr/bin/env python
|
|
"""
|
|
Fixed BrowseComp benchmark script that properly handles decryption.
|
|
This version uses the canary string from each example as the decryption key.
|
|
"""
|
|
|
|
import base64
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import pandas as pd
|
|
|
|
# Set up Python path
|
|
current_dir = str(Path(__file__).parent.resolve())
|
|
sys.path.insert(0, current_dir)
|
|
|
|
try:
|
|
from local_deep_research.api import quick_summary
|
|
except ImportError as e:
|
|
print(f"Error importing from api: {e}")
|
|
sys.exit(1)
|
|
|
|
# The template for formatting queries to the model
|
|
QUERY_TEMPLATE = """
|
|
{Question}
|
|
|
|
Your response should be in the following format:
|
|
Explanation: {{your explanation for your final answer}}
|
|
Exact Answer: {{your succinct, final answer}}
|
|
Confidence: {{your confidence score between 0% and 100% for your answer}}
|
|
""".strip()
|
|
|
|
# Grader template from BrowseComp
|
|
GRADER_TEMPLATE = """
|
|
Judge whether the following [response] to [question] is correct or not based on the precise and unambiguous [correct_answer] below.
|
|
|
|
[question]: {question}
|
|
|
|
[response]: {response}
|
|
|
|
Your judgement must be in the format and criteria specified below:
|
|
|
|
extracted_final_answer: The final exact answer extracted from the [response]. Put the extracted answer as 'None' if there is no exact, final answer to extract from the response.
|
|
|
|
[correct_answer]: {correct_answer}
|
|
|
|
reasoning: Explain why the extracted_final_answer is correct or incorrect based on [correct_answer], focusing only on if there are meaningful differences between [correct_answer] and the extracted_final_answer. Do not comment on any background to the problem, do not attempt to solve the problem, do not argue for any answer different than [correct_answer], focus only on whether the answers match.
|
|
|
|
correct: Answer 'yes' if extracted_final_answer matches the [correct_answer] given above, or is within a small margin of error for numerical problems. Answer 'no' otherwise, i.e. if there if there is any inconsistency, ambiguity, non-equivalency, or if the extracted answer is incorrect.
|
|
|
|
confidence: The extracted confidence score between 0% and 100% from [response]. Put 100 if there is no confidence score available.
|
|
""".strip()
|
|
|
|
|
|
def derive_key(password: str, length: int) -> bytes:
|
|
"""Derive a fixed-length key from the password using SHA256."""
|
|
hasher = hashlib.sha256()
|
|
hasher.update(password.encode())
|
|
key = hasher.digest()
|
|
return key * (length // len(key)) + key[: length % len(key)]
|
|
|
|
|
|
def decrypt(ciphertext_b64: str, password: str) -> str:
|
|
"""Decrypt base64-encoded ciphertext with XOR."""
|
|
try:
|
|
encrypted = base64.b64decode(ciphertext_b64)
|
|
key = derive_key(password, len(encrypted))
|
|
decrypted = bytes(a ^ b for a, b in zip(encrypted, key, strict=False))
|
|
return decrypted.decode()
|
|
except Exception as e:
|
|
print(f"Error decrypting data: {e!s}")
|
|
return f"Error: Could not decrypt data: {str(e)[:100]}"
|
|
|
|
|
|
def run_browsecomp_evaluation(
|
|
dataset_path: str = "https://openaipublic.blob.core.windows.net/simple-evals/browse_comp_test_set.csv",
|
|
output_dir: str = "benchmark_results/browsecomp",
|
|
output_file: str = "ldr_browsecomp_results.jsonl",
|
|
num_examples: Optional[int] = None,
|
|
seed: int = 42,
|
|
search_iterations: int = 2,
|
|
questions_per_iteration: int = 9,
|
|
search_tool: str = "searxng",
|
|
):
|
|
"""
|
|
Run the BrowseComp evaluation using Local Deep Research.
|
|
"""
|
|
# Ensure output directory exists
|
|
Path(output_dir).mkdir(parents=True, exist_ok=True)
|
|
output_path = str(Path(output_dir) / output_file)
|
|
|
|
# Load BrowseComp dataset
|
|
print(f"Loading dataset from {dataset_path}")
|
|
df = pd.read_csv(dataset_path)
|
|
|
|
examples = [row.to_dict() for _, row in df.iterrows()]
|
|
|
|
# Display sample canary
|
|
if examples:
|
|
print(
|
|
f"Sample canary: {examples[0].get('canary', 'No canary found')[:30]}..."
|
|
)
|
|
|
|
# Sample examples if specified
|
|
if num_examples and num_examples < len(examples):
|
|
random.seed(seed)
|
|
examples = random.sample(examples, num_examples)
|
|
print(f"Sampled {num_examples} examples from {len(df)} total examples")
|
|
|
|
# Remove output file if it exists to avoid appending
|
|
if Path(output_path).exists():
|
|
os.remove(output_path)
|
|
|
|
results = []
|
|
correct_count = 0
|
|
|
|
print("\nStarting BrowseComp evaluation with settings:")
|
|
print(f"- Number of examples: {len(examples)}")
|
|
print(f"- Search iterations: {search_iterations}")
|
|
print(f"- Questions per iteration: {questions_per_iteration}")
|
|
print(f"- Search tool: {search_tool}")
|
|
print(f"- Output file: {output_path}")
|
|
|
|
# Process each question
|
|
for i, example in enumerate(examples):
|
|
# Decrypt the problem and answer using the canary
|
|
try:
|
|
problem = decrypt(
|
|
example.get("problem", ""), example.get("canary", "")
|
|
)
|
|
correct_answer = decrypt(
|
|
example.get("answer", ""), example.get("canary", "")
|
|
)
|
|
|
|
print(f"\nProcessing {i + 1}/{len(examples)}: {problem[:100]}...")
|
|
print(f"Correct answer: {correct_answer[:100]}...")
|
|
except Exception as e:
|
|
print(f"Error decrypting problem/answer: {e}")
|
|
problem = f"Error decrypting: {str(e)[:50]}"
|
|
correct_answer = "Unknown due to decryption error"
|
|
|
|
# Format the question using the QUERY_TEMPLATE
|
|
formatted_question = QUERY_TEMPLATE.format(Question=problem)
|
|
|
|
try:
|
|
# Query using quick_summary with specified parameters
|
|
summary = quick_summary(
|
|
query=formatted_question,
|
|
iterations=search_iterations,
|
|
questions_per_iteration=questions_per_iteration,
|
|
search_tool=search_tool,
|
|
)
|
|
|
|
# Extract the response
|
|
response = summary.get("summary", "")
|
|
|
|
# Clean up the response for better evaluation
|
|
response = (
|
|
response.replace("[1]", "")
|
|
.replace("[2]", "")
|
|
.replace("[3]", "")
|
|
)
|
|
response = " ".join(
|
|
[
|
|
line
|
|
for line in response.split("\n")
|
|
if not line.startswith("[")
|
|
]
|
|
)
|
|
|
|
# Extract the final answer from the response
|
|
answer_match = re.search(r"Exact Answer:\s*(.*?)(?:\n|$)", response)
|
|
exact_answer = (
|
|
answer_match.group(1).strip() if answer_match else "None"
|
|
)
|
|
|
|
# Extract confidence from the response
|
|
confidence_match = re.search(r"Confidence:\s*(\d+)%", response)
|
|
confidence = (
|
|
confidence_match.group(1) if confidence_match else "100"
|
|
)
|
|
|
|
# Simple accuracy check (for basic reporting)
|
|
# Note: Real evaluation would use a more sophisticated approach
|
|
is_correct = exact_answer.lower() == correct_answer.lower()
|
|
if is_correct:
|
|
correct_count += 1
|
|
|
|
# Format result for output
|
|
result = {
|
|
"id": example.get("id", f"q{i}"),
|
|
"problem": problem,
|
|
"correct_answer": correct_answer,
|
|
"response": response,
|
|
"extracted_answer": exact_answer,
|
|
"confidence": confidence,
|
|
"is_correct": is_correct,
|
|
}
|
|
|
|
# Write incrementally to output file
|
|
with open(output_path, "a") as f:
|
|
f.write(json.dumps(result) + "\n")
|
|
|
|
results.append(result)
|
|
|
|
# Print progress
|
|
print(f" Response: {exact_answer}")
|
|
print(f" Correct: {is_correct}")
|
|
print(
|
|
f" Current accuracy: {correct_count}/{i + 1} ({(correct_count / (i + 1)) * 100:.1f}%)"
|
|
)
|
|
|
|
except Exception as e:
|
|
print(f"Error processing question {i + 1}: {e!s}")
|
|
# In case of error, write a placeholder result
|
|
result = {
|
|
"id": example.get("id", f"q{i}"),
|
|
"problem": problem,
|
|
"correct_answer": correct_answer,
|
|
"response": f"Error processing this question: {str(e)[:100]}",
|
|
"extracted_answer": "None",
|
|
"confidence": "0",
|
|
"is_correct": False,
|
|
}
|
|
with open(output_path, "a") as f:
|
|
f.write(json.dumps(result) + "\n")
|
|
|
|
results.append(result)
|
|
|
|
# Calculate overall accuracy
|
|
accuracy = correct_count / len(examples) if examples else 0
|
|
|
|
# Write summary report
|
|
report = {
|
|
"total_examples": len(examples),
|
|
"correct_count": correct_count,
|
|
"accuracy": accuracy,
|
|
"search_iterations": search_iterations,
|
|
"questions_per_iteration": questions_per_iteration,
|
|
"search_tool": search_tool,
|
|
}
|
|
|
|
report_path = str(Path(output_dir) / "browsecomp_summary.json")
|
|
with open(report_path, "w") as f:
|
|
json.dump(report, f, indent=2)
|
|
|
|
print("\nEvaluation complete.")
|
|
print(f"Results saved to {output_path}")
|
|
print(f"Summary saved to {report_path}")
|
|
print(f"Final accuracy: {accuracy:.4f} ({correct_count}/{len(examples)})")
|
|
|
|
return results
|
|
|
|
|
|
# Main execution
|
|
if __name__ == "__main__":
|
|
import argparse
|
|
|
|
parser = argparse.ArgumentParser(
|
|
description="Run BrowseComp benchmark with proper decryption"
|
|
)
|
|
parser.add_argument(
|
|
"--examples",
|
|
type=int,
|
|
default=10,
|
|
help="Number of examples to use (default: 10)",
|
|
)
|
|
parser.add_argument(
|
|
"--iterations",
|
|
type=int,
|
|
default=2,
|
|
help="Search iterations (default: 2)",
|
|
)
|
|
parser.add_argument(
|
|
"--questions",
|
|
type=int,
|
|
default=9,
|
|
help="Questions per iteration (default: 9)",
|
|
)
|
|
parser.add_argument(
|
|
"--search-tool",
|
|
type=str,
|
|
default="searxng",
|
|
help="Search tool to use (default: searxng)",
|
|
)
|
|
parser.add_argument(
|
|
"--output-dir",
|
|
type=str,
|
|
default="benchmark_results/browsecomp",
|
|
help="Output directory (default: benchmark_results/browsecomp)",
|
|
)
|
|
|
|
args = parser.parse_args()
|
|
|
|
print("Starting BrowseComp benchmark with proper decryption...")
|
|
run_browsecomp_evaluation(
|
|
num_examples=args.examples,
|
|
search_iterations=args.iterations,
|
|
questions_per_iteration=args.questions,
|
|
search_tool=args.search_tool,
|
|
output_dir=args.output_dir,
|
|
)
|