mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-15 19:46:56 +03:00
- Replace standard logging with loguru across codebase (50+ files) - Fix logger.exception usage in exception handlers (100+ fixes) - Update environment variable access to use SettingsManager - Improve pre-commit hooks to handle legitimate use cases - Fix formatting and linting issues Reduced custom code check violations from 268 to 38 (86% reduction) Reduced env var check violations from 73 to 57 (22% reduction)
126 lines
4.0 KiB
Python
126 lines
4.0 KiB
Python
#!/usr/bin/env python
|
|
"""
|
|
Test script to validate BrowseComp dataset loading and decryption.
|
|
This helps debug issues with the BrowseComp dataset.
|
|
"""
|
|
|
|
import sys
|
|
|
|
from loguru import logger
|
|
|
|
# Drop loguru's existing handlers and log to stderr at DEBUG level so this
# diagnostic script shows as much detail as possible.
logger.remove()
logger.add(sys.stderr, level="DEBUG")

# Put the current working directory on the import path so that
# local_deep_research can be imported.
sys.path.append(".")

try:
    from local_deep_research.benchmarks.datasets import decrypt, load_dataset
except ImportError as import_error:
    # Nothing below can run without the dataset helpers; report and exit
    # with a non-zero status.
    print(f"Error importing modules: {import_error}")
    sys.exit(1)
|
|
|
|
|
|
def test_browsecomp_decryption():
    """Load a few BrowseComp examples and print whether each looks decrypted.

    Three examples are requested through ``load_dataset("browsecomp", ...)``.
    An example that carries an ``original_problem`` key is reported as
    decrypted. Otherwise its raw ``problem``/``answer`` fields are shown and,
    when the example has a non-empty ``canary`` value, ``decrypt`` is applied
    by hand to both fields.

    All results go to stdout. Any ``Exception`` raised while loading or
    inspecting the examples is caught and printed instead of propagating.
    """

    def preview(record, key):
        # First 50 characters of a field; a missing key yields "".
        return record.get(key, "")[:50]

    print("\n=== Testing BrowseComp Decryption ===\n")

    try:
        # A handful of examples is enough to see whether decryption works.
        records = load_dataset("browsecomp", num_examples=3)

        if not records:
            print("Error: No examples loaded from dataset")
            return

        print(
            f"Successfully loaded {len(records)} examples from BrowseComp dataset\n"
        )

        for position, record in enumerate(records, start=1):
            print(f"Example {position}:")
            print(f" ID: {record.get('id', 'unknown')}")

            if "original_problem" not in record:
                # No trace of the encrypted original: show the raw fields
                # and fall back to decrypting them manually.
                print(
                    " Decryption may have failed - no original_problem field"
                )
                print(f" Problem: {preview(record, 'problem')}...")
                print(f" Answer: {preview(record, 'answer')}...")

                canary = record.get("canary", "")
                if not canary:
                    print(" No canary found for manual decryption")
                else:
                    print("\n Attempting manual decryption...")
                    try:
                        raw_problem = record.get("problem", "")
                        raw_answer = record.get("answer", "")

                        # Decrypt both fields before printing either one,
                        # so a failure produces only the error line.
                        plain_problem = decrypt(raw_problem, canary)
                        plain_answer = decrypt(raw_answer, canary)

                        print(
                            f" Manually decrypted problem: {plain_problem[:50]}..."
                        )
                        print(
                            f" Manually decrypted answer: {plain_answer[:50]}..."
                        )
                    except Exception as decrypt_error:
                        print(f" Manual decryption failed: {decrypt_error}")
            else:
                # The loader kept the encrypted original next to the
                # decrypted fields.
                print(" Decryption successful!")
                print(
                    f" Original problem (encrypted): {preview(record, 'original_problem')}..."
                )
                print(f" Decrypted problem: {preview(record, 'problem')}...")
                print(
                    f" Decrypted answer: {preview(record, 'correct_answer')}..."
                )

            # Blank line between examples.
            print()

    except Exception as test_error:
        print(f"Error in test: {test_error}")
|
|
|
|
|
|
def test_simpleqa_loading():
    """Load a few SimpleQA examples and print them for comparison.

    Three examples are requested through ``load_dataset("simpleqa", ...)``.
    For each one the ``id`` is printed, followed by the first 50 characters
    of its ``problem`` and ``answer`` fields.

    All results go to stdout. Any ``Exception`` raised while loading or
    printing the examples is caught and printed instead of propagating.
    """
    print("\n=== Testing SimpleQA Loading ===\n")

    try:
        # A handful of examples is enough for a sanity check.
        records = load_dataset("simpleqa", num_examples=3)

        if not records:
            print("Error: No examples loaded from dataset")
            return

        print(
            f"Successfully loaded {len(records)} examples from SimpleQA dataset\n"
        )

        # Fields shown truncated, as (printed label, dictionary key).
        truncated_fields = (("Problem", "problem"), ("Answer", "answer"))

        for position, record in enumerate(records, start=1):
            print(f"Example {position}:")
            print(f" ID: {record.get('id', 'unknown')}")
            for label, key in truncated_fields:
                print(f" {label}: {record.get(key, '')[:50]}...")
            # Blank line between examples.
            print()

    except Exception as test_error:
        print(f"Error in test: {test_error}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the BrowseComp check first, then SimpleQA for comparison.
    for run_check in (test_browsecomp_decryption, test_simpleqa_loading):
        run_check()
|