Files
local-deep-research/tests/test_context_overflow_detection.py
LearningCircuit a385258c99 test: remove 7 no-assertion / placeholder tests across 7 files (#4247)
PUNCHLIST Tier 2 (no-assertions) cleanup. Each removed test had a
test name that promised coverage of a specific behavior but the body
contained no assertion exercising that behavior — they were silent
no-ops that always passed.

Removed
-------
1. tests/test_api_settings_advanced.py — test_web_api_still_uses_database
   Body ends with `pass  # Placeholder for actual web API testing`.

2. tests/test_api_settings_e2e.py — test_provider_fallback_scenario
   Sets up mocks + side_effect closure, builds a snapshot, never calls
   anything that exercises the fallback path. No assert.

3. tests/test_link_analytics.py — test_get_link_analytics_source_categorization
   Calls get_link_analytics() and exits. Trailing comment admits:
   "actual categorization is done by DomainClassifier ... so we can't
   assert exact counts". No assertion on the actual categorization.

4. tests/web/routes/test_context_overflow_api.py — test_get_metrics_no_username
   Patches login_required, opens an app context, body ends with `pass`.
   Was supposed to verify 401-when-no-username; verifies nothing.

5. tests/test_context_overflow_detection.py — test_token_usage_save_with_overflow_fields
   First line is `pytest.skip("Requires Flask application context")` —
   everything below is dead code. The test never runs and never has.

6. tests/api_tests/test_api_pytest.py — test_cors_headers, test_rate_limiting_headers
   Both only assert health endpoint returns 200. CORS / rate-limit
   header checks are commented-out TODOs. Test names lie about coverage.

7. tests/api_tests/test_rest_api.py — test_cors_headers
   Same anti-pattern. Asserts health returns 200, then prints a fake
   success line. No CORS verification.
2026-05-24 22:22:05 +02:00

542 lines
22 KiB
Python
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""Test context overflow detection for LLM calls."""
import os
import uuid
from unittest.mock import Mock, patch
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
from local_deep_research.database.models import Base
from local_deep_research.metrics.token_counter import TokenCountingCallback
class TestContextOverflowDetection:
"""Test suite for context overflow detection."""
@pytest.fixture
def db_session(self):
"""Create an in-memory database session for testing."""
engine = create_engine("sqlite:///:memory:")
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)
session = Session()
yield session
session.close()
engine.dispose()
@pytest.fixture
def token_callback(self):
"""Create a token counting callback for testing."""
research_id = str(uuid.uuid4())
research_context = {
"research_query": "Test query",
"research_mode": "test",
"context_limit": 2048, # Set a specific context limit
"username": "test_user",
"user_password": "test_pass",
}
return TokenCountingCallback(research_id, research_context)
def test_context_overflow_detection_no_overflow(self, token_callback):
"""Test that no overflow is detected for small prompts."""
# Simulate LLM start with small prompt
prompts = ["What is 2+2?"]
token_callback.on_llm_start({}, prompts)
# Create mock response with Ollama-style metadata
mock_response = Mock()
mock_response.llm_output = None # Explicitly set to None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][
0
].message.usage_metadata = None # No usage_metadata
mock_response.generations[0][0].message.response_metadata = {
"prompt_eval_count": 5, # Small token count
"eval_count": 10,
"total_duration": 1000000000, # 1 second in nanoseconds
}
# Process response
token_callback.on_llm_end(mock_response)
# Verify no overflow detected
assert token_callback.context_truncated is False
assert token_callback.tokens_truncated == 0
assert token_callback.truncation_ratio == 0.0
assert token_callback.ollama_metrics.get("prompt_eval_count") == 5
def test_context_overflow_detection_with_overflow(self, token_callback):
"""Test that overflow is detected when prompt approaches context limit."""
# Simulate LLM start with large prompt
large_text = "The quick brown fox jumps over the lazy dog. " * 500
prompts = [large_text]
token_callback.on_llm_start({}, prompts)
# Create mock response indicating near-limit token usage
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = None
mock_response.generations[0][0].message.response_metadata = {
"prompt_eval_count": 1950, # Near the 2048 limit (95%)
"eval_count": 50,
"total_duration": 5000000000, # 5 seconds
"prompt_eval_duration": 4000000000,
"eval_duration": 1000000000,
}
# Process response
token_callback.on_llm_end(mock_response)
# Verify overflow detected
assert token_callback.context_truncated is True
assert token_callback.tokens_truncated > 0 # Should estimate truncation
assert token_callback.truncation_ratio > 0
assert token_callback.ollama_metrics["prompt_eval_count"] == 1950
def test_ollama_raw_metrics_capture(self, token_callback):
"""Test that raw Ollama metrics are properly captured."""
prompts = ["Test prompt"]
token_callback.on_llm_start({}, prompts)
# Create mock response with full Ollama metrics
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = None
mock_response.generations[0][0].message.response_metadata = {
"prompt_eval_count": 100,
"eval_count": 200,
"total_duration": 3000000000,
"load_duration": 500000000,
"prompt_eval_duration": 2000000000,
"eval_duration": 500000000,
}
# Process response
token_callback.on_llm_end(mock_response)
# Verify all metrics captured
assert token_callback.ollama_metrics["prompt_eval_count"] == 100
assert token_callback.ollama_metrics["eval_count"] == 200
assert token_callback.ollama_metrics["total_duration"] == 3000000000
assert token_callback.ollama_metrics["load_duration"] == 500000000
assert (
token_callback.ollama_metrics["prompt_eval_duration"] == 2000000000
)
assert token_callback.ollama_metrics["eval_duration"] == 500000000
def test_context_limit_from_research_context(self):
"""Test that context limit is properly read from research context."""
# Create callback with context limit
callback = TokenCountingCallback("test-id", {"context_limit": 2048})
# Context limit is set on llm_start
callback.on_llm_start({}, ["test"])
assert callback.context_limit == 2048
# Test with different limit
callback_4k = TokenCountingCallback("test-id", {"context_limit": 4096})
callback_4k.on_llm_start({}, ["test"])
assert callback_4k.context_limit == 4096
# Test with no limit
callback_no_limit = TokenCountingCallback("test-id", {})
callback_no_limit.on_llm_start({}, ["test"])
assert callback_no_limit.context_limit is None
def test_prompt_size_estimation(self, token_callback):
"""Test that prompt size is estimated correctly."""
# Test with single prompt
prompts = ["This is a test prompt with approximately 10 words."]
token_callback.on_llm_start({}, prompts)
# Rough estimate: ~4 chars per token
expected_tokens = len(prompts[0]) // 4
assert (
abs(token_callback.original_prompt_estimate - expected_tokens) < 5
)
# Test with multiple prompts
token_callback.original_prompt_estimate = 0
prompts = ["First prompt.", "Second prompt.", "Third prompt."]
total_chars = sum(len(p) for p in prompts)
token_callback.on_llm_start({}, prompts)
expected_tokens = total_chars // 4
assert (
abs(token_callback.original_prompt_estimate - expected_tokens) < 5
)
@patch("local_deep_research.metrics.token_counter.logger")
def test_overflow_warning_logged(self, mock_logger, token_callback):
"""Test that overflow detection logs a warning."""
# Create large prompt
large_text = "word " * 10000
prompts = [large_text]
token_callback.on_llm_start({}, prompts)
# Mock response at context limit
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = None
mock_response.generations[0][0].message.response_metadata = {
"prompt_eval_count": 2000, # At 95% of 2048 limit
"eval_count": 10,
}
token_callback.on_llm_end(mock_response)
# Verify warning was logged with structured fields
mock_logger.warning.assert_called()
# The new logger.warning is called with multiple string args that get
# concatenated by f-strings; inspect each positional arg for the fields.
warning_call = " ".join(
str(a) for a in mock_logger.warning.call_args[0]
)
assert "Context overflow detected" in warning_call
assert "[provider-confirmed]" in warning_call
assert "prompt_tokens=2000" in warning_call
assert "context_limit=2048" in warning_call
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_overflow_for_non_ollama_provider(
self, mock_logger, token_callback
):
"""Detection fires from prompt estimate when provider doesn't echo prompt_eval_count."""
# Use preset_model/preset_provider so on_llm_start initializes by_model.
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
# Large prompt: ~10000 tokens (well over 2048 context_limit)
large_text = "word " * 10000
token_callback.on_llm_start({}, [large_text])
# OpenAI-style response: usage_metadata, no prompt_eval_count
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = {
"input_tokens": 100,
"output_tokens": 50,
"total_tokens": 150,
}
mock_response.generations[0][0].message.response_metadata = {}
token_callback.on_llm_end(mock_response)
# Estimation path should mark truncation and log
assert token_callback.context_truncated is True
assert token_callback.tokens_truncated > 0
mock_logger.warning.assert_called()
warning_call = " ".join(
str(a) for a in mock_logger.warning.call_args[0]
)
assert "[estimated]" in warning_call
assert "provider=openai" in warning_call
assert "context_limit=2048" in warning_call
@patch("local_deep_research.metrics.token_counter.logger")
def test_provider_confirmed_total_context_overflow(
self, mock_logger, token_callback
):
"""[total-context] fires when input+output exceeds limit but input alone doesn't.
Input must stay strictly under the input-only threshold (80% of
context_limit, set by PR #3792 / #3840) so the input-only branch
does NOT fire and we exercise the total-context elif path.
"""
large_text = "word " * 500
token_callback.on_llm_start({}, [large_text])
# Input is below 80% of 2048 (1638); but input + output >= 95% of 2048 (1945).
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = None
mock_response.generations[0][0].message.response_metadata = {
"prompt_eval_count": 1500, # 73% of 2048 — below 80% input-only threshold
"eval_count": 500, # 1500+500=2000 >= 95% of 2048 (1945)
"total_duration": 3000000000,
}
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is True
mock_logger.warning.assert_called()
warning_call = " ".join(
str(a) for a in mock_logger.warning.call_args[0]
)
assert "[total-context]" in warning_call
assert "prompt_tokens=1500" in warning_call
assert "completion_tokens=500" in warning_call
assert "total_tokens=2000" in warning_call
assert "context_limit=2048" in warning_call
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_total_context_overflow_for_non_ollama(
self, mock_logger, token_callback
):
"""[total-context] fires when hosted provider input+output exceeds limit.
Input alone stays below 80% of 2048 (input-only threshold, set by
#3792/#3840) so the elif total-context branch is exercised.
"""
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
# Prompt estimate below context_limit but not by much
medium_text = "word " * 1500 # ~1875 estimated tokens
token_callback.on_llm_start({}, [medium_text])
# Response reports actual tokens: input below 80% but input+output >= 95%.
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = {
"input_tokens": 1500, # 73% of 2048 — below 80% input-only threshold
"output_tokens": 600, # 1500+600=2100 > 2048 (so tokens_truncated > 0)
"total_tokens": 2100,
}
mock_response.generations[0][0].message.response_metadata = {}
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is True
assert token_callback.tokens_truncated > 0
mock_logger.warning.assert_called()
warning_call = " ".join(
str(a) for a in mock_logger.warning.call_args[0]
)
assert "[total-context]" in warning_call
assert "prompt_tokens=1500" in warning_call
assert "completion_tokens=600" in warning_call
assert "total_tokens=2100" in warning_call
assert "context_limit=2048" in warning_call
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_total_context_overflow_via_llm_output(
self, mock_logger, token_callback
):
"""[estimated-total-context] fires when token_usage comes from llm_output.
Distinct from the [total-context] path: this exercises the post-loop
estimation block (token_counter.py ~line 440) where token_usage is
sourced from response.llm_output rather than from the per-generation
usage_metadata that _check_context_overflow consumes.
Conditions for this path:
- token_usage populated from llm_output (not from generations)
- generations have NO usage_metadata AND NO response_metadata
(so _check_context_overflow is never called)
- original_prompt_estimate <= context_limit (so the [estimated]
input-only branch above doesn't fire)
- prompt_tokens + completion_tokens >= 95% of context_limit
"""
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
# Estimate well below context_limit — input-only [estimated] stays quiet.
small_text = "word " * 100 # ~125 estimated tokens
token_callback.on_llm_start({}, [small_text])
# token_usage flows in via llm_output; generations carry no metadata
# so _check_context_overflow is never called.
mock_response = Mock()
mock_response.llm_output = {
"token_usage": {
"prompt_tokens": 1500, # 73% of 2048 — below 80%
"completion_tokens": 600, # 1500+600=2100, past context_limit
"total_tokens": 2100,
}
}
# Empty generations list → loop never enters _check_context_overflow.
mock_response.generations = []
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is True
assert token_callback.tokens_truncated > 0
mock_logger.warning.assert_called()
warning_call = " ".join(
str(a) for a in mock_logger.warning.call_args[0]
)
assert "[estimated-total-context]" in warning_call
assert "prompt_tokens=1500" in warning_call
assert "completion_tokens=600" in warning_call
assert "total_tokens=2100" in warning_call
assert "context_limit=2048" in warning_call
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_path_fires_on_subsequent_calls_after_first_truncation(
self, mock_logger, token_callback
):
"""Regression: TokenCountingCallback is reused across LLM calls in a
research session (see config/llm_config.py wrap_llm). on_llm_start
must reset context_truncated/tokens_truncated/truncation_ratio,
otherwise the post-loop estimation block's `if not context_truncated`
guard silently disables [estimated] / [estimated-total-context] for
every call after the first one that truncates.
"""
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
def llm_output_overflow_response():
mock_response = Mock()
mock_response.llm_output = {
"token_usage": {
"prompt_tokens": 1500,
"completion_tokens": 600,
"total_tokens": 2100,
}
}
mock_response.generations = []
return mock_response
# Call 1: triggers [estimated-total-context] → context_truncated=True.
token_callback.on_llm_start({}, ["word " * 100])
token_callback.on_llm_end(llm_output_overflow_response())
first_call_count = mock_logger.warning.call_count
assert first_call_count >= 1
assert token_callback.context_truncated is True
# Call 2: same overflow shape — must ALSO log, not be silenced by
# leftover state from Call 1.
token_callback.on_llm_start({}, ["word " * 100])
# Reset asserted by the second on_llm_start clearing prior state:
assert token_callback.context_truncated is False
token_callback.on_llm_end(llm_output_overflow_response())
assert mock_logger.warning.call_count > first_call_count
assert token_callback.context_truncated is True
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_overflow_skipped_when_context_limit_none(
self, mock_logger, token_callback
):
"""Estimation path should not fire when context_limit is not set."""
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
# Remove context_limit
token_callback.research_context = {}
large_text = "word " * 10000
token_callback.on_llm_start({}, [large_text])
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = {
"input_tokens": 100,
"output_tokens": 50,
"total_tokens": 150,
}
mock_response.generations[0][0].message.response_metadata = {}
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is False
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_overflow_zero_prompt_estimate(
self, mock_logger, token_callback
):
"""Estimation path should not fire when prompt estimate is 0."""
token_callback.preset_model = "gpt-4"
token_callback.preset_provider = "openai"
token_callback.original_prompt_estimate = 0
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = {
"input_tokens": 100,
"output_tokens": 50,
"total_tokens": 150,
}
mock_response.generations[0][0].message.response_metadata = {}
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is False
assert token_callback.tokens_truncated == 0
@patch("local_deep_research.metrics.token_counter.logger")
def test_estimated_overflow_without_on_llm_start(
self, mock_logger, token_callback
):
"""Estimation path should not crash if on_llm_start was never called."""
# Don't call on_llm_start — fields stay at defaults
mock_response = Mock()
mock_response.llm_output = None
mock_response.generations = [[Mock()]]
mock_response.generations[0][0].message = Mock()
mock_response.generations[0][0].message.usage_metadata = {
"input_tokens": 100,
"output_tokens": 50,
"total_tokens": 150,
}
mock_response.generations[0][0].message.response_metadata = {}
# Should not raise and should not mark truncation
token_callback.on_llm_end(mock_response)
assert token_callback.context_truncated is False
@pytest.mark.skipif(
os.environ.get("SKIP_OLLAMA_TESTS", "true").lower() == "true",
reason="Ollama integration tests skipped",
)
class TestContextOverflowIntegration:
"""Integration tests with actual Ollama (when available)."""
@pytest.mark.slow
def test_ollama_context_overflow_real(self):
"""Test with real Ollama instance if available."""
from langchain_ollama import ChatOllama
from local_deep_research.config.llm_config import (
is_ollama_available,
)
if not is_ollama_available():
pytest.skip("Ollama not available")
# Create LLM with small context window
llm = ChatOllama(
model="llama3.2:latest",
num_ctx=512, # Very small context for testing
temperature=0.1,
)
# Create callback
research_id = str(uuid.uuid4())
callback = TokenCountingCallback(research_id, {"context_limit": 512})
# Create prompt that will likely overflow
large_prompt = "Please analyze this text: " + ("word " * 200)
# Run with callback
try:
_ = llm.invoke(large_prompt, config={"callbacks": [callback]})
# Check if overflow was detected
if callback.ollama_metrics.get("prompt_eval_count"):
prompt_tokens = callback.ollama_metrics["prompt_eval_count"]
if prompt_tokens >= 512 * 0.80:
assert callback.context_truncated is True
print(f"✅ Overflow detected: {prompt_tokens}/512 tokens")
else:
print(f" No overflow: {prompt_tokens}/512 tokens")
except Exception as e:
pytest.skip(f"Ollama test failed: {e}")