local-deep-research/examples/example_browsecomp.py

#!/usr/bin/env python
"""
Test script to validate BrowseComp dataset loading and decryption.
This helps debug issues with the BrowseComp dataset.
"""

import logging
import sys

# Configure logging
logging.basicConfig(
    level=logging.DEBUG,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger()

# Add path to import local_deep_research
sys.path.append(".")

try:
    from local_deep_research.benchmarks.datasets import decrypt, load_dataset
except ImportError as e:
    print(f"Error importing modules: {e}")
    sys.exit(1)


def test_browsecomp_decryption():
    """Test decryption of BrowseComp dataset."""
    print("\n=== Testing BrowseComp Decryption ===\n")

    try:
        # Load a small number of examples to test
        examples = load_dataset("browsecomp", num_examples=3)

        if not examples:
            print("Error: No examples loaded from dataset")
            return

        print(
            f"Successfully loaded {len(examples)} examples from BrowseComp dataset\n"
        )

        # Check if decryption worked by examining examples
        for i, example in enumerate(examples):
            print(f"Example {i + 1}:")
            print(f"  ID: {example.get('id', 'unknown')}")

            # Check if we have decrypted data
            if "original_problem" in example:
                print("  Decryption successful!")
                print(
                    f"  Original problem (encrypted): {example.get('original_problem', '')[:50]}..."
                )
                print(
                    f"  Decrypted problem: {example.get('problem', '')[:50]}..."
                )
                print(
                    f"  Decrypted answer: {example.get('correct_answer', '')[:50]}..."
                )
            else:
                print(
                    "  Decryption may have failed - no original_problem field"
                )
                print(f"  Problem: {example.get('problem', '')[:50]}...")
                print(f"  Answer: {example.get('answer', '')[:50]}...")

                # Try manual decryption
                canary = example.get("canary", "")
                if canary:
                    print("\n  Attempting manual decryption...")
                    try:
                        problem = example.get("problem", "")
                        answer = example.get("answer", "")

                        decrypted_problem = decrypt(problem, canary)
                        decrypted_answer = decrypt(answer, canary)

                        print(
                            f"  Manually decrypted problem: {decrypted_problem[:50]}..."
                        )
                        print(
                            f"  Manually decrypted answer: {decrypted_answer[:50]}..."
                        )
                    except Exception as e:
                        print(f"  Manual decryption failed: {e}")
                else:
                    print("  No canary found for manual decryption")

            print()

    except Exception as e:
        print(f"Error in test: {e}")


def test_simpleqa_loading():
    """Test loading of SimpleQA dataset for comparison."""
    print("\n=== Testing SimpleQA Loading ===\n")

    try:
        # Load a small number of examples to test
        examples = load_dataset("simpleqa", num_examples=3)

        if not examples:
            print("Error: No examples loaded from dataset")
            return

        print(
            f"Successfully loaded {len(examples)} examples from SimpleQA dataset\n"
        )

        # Check examples
        for i, example in enumerate(examples):
            print(f"Example {i + 1}:")
            print(f"  ID: {example.get('id', 'unknown')}")
            print(f"  Problem: {example.get('problem', '')[:50]}...")
            print(f"  Answer: {example.get('answer', '')[:50]}...")
            print()

    except Exception as e:
        print(f"Error in test: {e}")


if __name__ == "__main__":
    # Test both datasets for comparison
    test_browsecomp_decryption()
    test_simpleqa_loading()