local-deep-research/tests/database/test_cache_models.py

"""Tests for cache-related database models."""

import hashlib
import json
import time
from datetime import datetime, timedelta, timezone, UTC

import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from local_deep_research.database.models import Base, Cache, SearchCache


class TestCacheModels:
    """Test suite for cache-related models."""

    @pytest.fixture
    def engine(self):
        """Create an in-memory SQLite database for testing."""
        engine = create_engine("sqlite:///:memory:")
        Base.metadata.create_all(engine)
        yield engine
        engine.dispose()

    @pytest.fixture
    def session(self, engine):
        """Create a database session for testing."""
        Session = sessionmaker(bind=engine)
        session = Session()
        yield session
        session.close()

    def test_cache_creation(self, session):
        """Test creating basic cache entries."""
        cache_entry = Cache(
            cache_key="llm_response_12345",
            cache_text="This is the cached LLM response for the query about quantum physics.",
            cache_type="llm_response",
            source="openai",
            ttl_seconds=86400,  # 24 hours
            expires_at=datetime.now(UTC) + timedelta(hours=24),
            cache_value={
                "model": "gpt-4",
                "temperature": 0.7,
                "query": "explain quantum entanglement",
            },
            size_bytes=1024,
        )

        session.add(cache_entry)
        session.commit()

        # Verify cache entry
        saved = session.query(Cache).first()
        assert saved is not None
        assert saved.cache_key == "llm_response_12345"
        assert "quantum physics" in saved.cache_text
        assert saved.cache_type == "llm_response"
        assert saved.cache_value["model"] == "gpt-4"
        assert saved.hit_count == 0
        assert saved.size_bytes == 1024

    def test_cache_expiration(self, session):
        """Test cache expiration functionality."""
        now = datetime.now(UTC)

        # Create expired cache
        expired = Cache(
            cache_key="expired_cache",
            cache_text="Old data",
            cache_type="test",
            expires_at=now - timedelta(hours=1),
        )

        # Create valid cache
        valid = Cache(
            cache_key="valid_cache",
            cache_text="Fresh data",
            cache_type="test",
            expires_at=now + timedelta(hours=1),
        )

        # Create non-expiring cache
        permanent = Cache(
            cache_key="permanent_cache",
            cache_text="Never expires",
            cache_type="test",
            expires_at=None,
        )

        session.add_all([expired, valid, permanent])
        session.commit()

        # Test is_expired method
        assert expired.is_expired() is True
        assert valid.is_expired() is False
        assert permanent.is_expired() is False

        # Query non-expired entries
        non_expired = (
            session.query(Cache)
            .filter((Cache.expires_at.is_(None)) | (Cache.expires_at > now))
            .all()
        )

        assert len(non_expired) == 2
        keys = [c.cache_key for c in non_expired]
        assert "valid_cache" in keys
        assert "permanent_cache" in keys

    def test_search_cache(self, session):
        """Test search-specific cache functionality."""
        query = "quantum physics research"
        query_hash = hashlib.sha256(query.encode()).hexdigest()

        current_time = int(time.time())
        search_cache = SearchCache(
            query_hash=query_hash,
            query_text=query,
            results=json.dumps(
                [
                    {
                        "title": "Quantum Mechanics",
                        "url": "https://example.com/qm",
                    },
                    {"title": "Physics Today", "url": "https://example.com/pt"},
                ]
            ),
            created_at=current_time,
            expires_at=current_time + 21600,  # 6 hours
            last_accessed=current_time,
            access_count=1,
        )

        session.add(search_cache)
        session.commit()

        # Verify search cache
        saved = (
            session.query(SearchCache).filter_by(query_hash=query_hash).first()
        )
        assert saved is not None
        assert saved.query_text == query
        results = json.loads(saved.results)
        assert len(results) == 2
        assert saved.access_count == 1

    def test_cache_categories(self, session):
        """Test different cache categories."""
        categories = [
            ("llm_response", "AI generated content", "openai"),
            ("search_result", "Search engine results", "google"),
            ("api_response", "External API response", "external"),
            ("computation", "Expensive computation result", "local"),
        ]

        for cache_type, value, source in categories:
            cache = Cache(
                cache_key=f"{cache_type}_test",
                cache_text=value,
                cache_type=cache_type,
                source=source,
                expires_at=datetime.now(UTC) + timedelta(hours=1),
            )
            session.add(cache)

        session.commit()

        # Query by category
        llm_caches = (
            session.query(Cache).filter_by(cache_type="llm_response").all()
        )
        assert len(llm_caches) == 1
        assert llm_caches[0].cache_text == "AI generated content"
        assert llm_caches[0].source == "openai"

    def test_cache_hit_tracking(self, session):
        """Test cache hit counting and access time updates."""
        cache = Cache(
            cache_key="hit_test",
            cache_text="Test content",
            cache_type="test",
            hit_count=0,
        )

        session.add(cache)
        session.commit()

        # Record multiple hits
        original_accessed = cache.accessed_at
        for i in range(5):
            cache.record_hit()
            session.commit()

        assert cache.hit_count == 5
        assert cache.accessed_at > original_accessed

    def test_search_cache_deduplication(self, session):
        """Test that identical queries produce the same hash."""
        query1 = "machine learning algorithms"
        query2 = "machine learning algorithms"  # Same query
        query3 = "Machine Learning Algorithms"  # Different case

        hash1 = hashlib.sha256(query1.encode()).hexdigest()
        hash2 = hashlib.sha256(query2.encode()).hexdigest()
        hash3 = hashlib.sha256(query3.encode()).hexdigest()

        assert hash1 == hash2
        assert hash1 != hash3  # Different case produces different hash

    def test_cache_size_management(self, session):
        """Test tracking cache entry sizes."""
        large_text = "x" * 10000
        small_text = "small"

        large_cache = Cache(
            cache_key="large_entry",
            cache_text=large_text,
            cache_type="test",
            size_bytes=len(large_text.encode()),
        )

        small_cache = Cache(
            cache_key="small_entry",
            cache_text=small_text,
            cache_type="test",
            size_bytes=len(small_text.encode()),
        )

        session.add_all([large_cache, small_cache])
        session.commit()

        # Query total cache size - sum all sizes
        from sqlalchemy import func

        total_size = (
            session.query(func.sum(Cache.size_bytes))
            .filter(Cache.size_bytes.isnot(None))
            .scalar()
            or 0
        )

        assert large_cache.size_bytes > small_cache.size_bytes
        assert total_size > 10000

    def test_cache_metadata_usage(self, session):
        """Test storing and retrieving cache metadata."""
        metadata = {
            "model": "gpt-4",
            "temperature": 0.7,
            "max_tokens": 1000,
            "timestamp": "2024-01-01T00:00:00Z",
        }

        cache = Cache(
            cache_key="metadata_test",
            cache_text="Response text",
            cache_type="llm_response",
            cache_value=metadata,
        )

        session.add(cache)
        session.commit()

        saved = session.query(Cache).first()
        assert saved.cache_value == metadata
        assert saved.cache_value["model"] == "gpt-4"

    def test_search_cache_with_filters(self, session):
        """Test search cache with various filter parameters."""
        current_time = int(time.time())

        # Add multiple search caches
        queries = [
            ("python tutorials", current_time - 3600),  # 1 hour ago
            ("javascript frameworks", current_time - 7200),  # 2 hours ago
            ("rust programming", current_time - 86400),  # 1 day ago
        ]

        for query, created in queries:
            query_hash = hashlib.sha256(query.encode()).hexdigest()
            cache = SearchCache(
                query_hash=query_hash,
                query_text=query,
                results=json.dumps([{"title": f"Result for {query}"}]),
                created_at=created,
                expires_at=created + 86400,  # 24 hour TTL
                last_accessed=created,
                access_count=1,
            )
            session.add(cache)

        session.commit()

        # Query recent caches (last 3 hours)
        recent_threshold = current_time - 10800
        recent_caches = (
            session.query(SearchCache)
            .filter(SearchCache.created_at >= recent_threshold)
            .all()
        )

        assert len(recent_caches) == 2

    def test_cache_cleanup_old_entries(self, session):
        """Test cleanup of expired cache entries."""
        now = datetime.now(timezone.utc)

        # Create caches with different expiration times
        for i in range(10):
            cache = Cache(
                cache_key=f"cache_{i}",
                cache_text=f"Content {i}",
                cache_type="test",
                expires_at=now - timedelta(hours=i),  # Some expired, some not
            )
            session.add(cache)

        session.commit()

        # Delete expired entries
        session.query(Cache).filter(Cache.expires_at < now).delete()
        session.commit()

        # Verify cleanup
        remaining = session.query(Cache).count()
        assert remaining == 1  # Only cache_0 should remain (expires_at = now)

    def test_cache_update_operations(self, session):
        """Test updating cache entries."""
        cache = Cache(
            cache_key="update_test",
            cache_text="Original content",
            cache_type="test",
            ttl_seconds=3600,
        )
        cache.set_ttl(3600)  # Set TTL

        session.add(cache)
        session.commit()

        # Update content
        cache.cache_text = "Updated content"
        cache.cache_value = {"version": 2}
        session.commit()

        # Verify updates
        saved = session.query(Cache).filter_by(cache_key="update_test").first()
        assert saved.cache_text == "Updated content"
        assert saved.cache_value["version"] == 2
        assert saved.ttl_seconds == 3600
        assert saved.expires_at is not None