mirror of
https://github.com/LearningCircuit/local-deep-research.git
synced 2026-06-16 03:51:07 +03:00
PUNCHLIST Tier 2 (no-assertions) cleanup. Each removed test had a
test name that promised coverage of a specific behavior but the body
contained no assertion exercising that behavior — they were silent
no-ops that always passed.
Removed
-------
1. tests/test_api_settings_advanced.py — test_web_api_still_uses_database
Body ends with `pass # Placeholder for actual web API testing`.
2. tests/test_api_settings_e2e.py — test_provider_fallback_scenario
Sets up mocks + side_effect closure, builds a snapshot, never calls
anything that exercises the fallback path. No assert.
3. tests/test_link_analytics.py — test_get_link_analytics_source_categorization
Calls get_link_analytics() and exits. Trailing comment admits:
"actual categorization is done by DomainClassifier ... so we can't
assert exact counts". No assertion on the actual categorization.
4. tests/web/routes/test_context_overflow_api.py — test_get_metrics_no_username
Patches login_required, opens an app context, body ends with `pass`.
Was supposed to verify 401-when-no-username; verifies nothing.
5. tests/test_context_overflow_detection.py — test_token_usage_save_with_overflow_fields
First line is `pytest.skip("Requires Flask application context")` —
everything below is dead code. The test never runs and never has.
6. tests/api_tests/test_api_pytest.py — test_cors_headers, test_rate_limiting_headers
Both only assert health endpoint returns 200. CORS / rate-limit
header checks are commented-out TODOs. Test names lie about coverage.
7. tests/api_tests/test_rest_api.py — test_cors_headers
Same anti-pattern. Asserts health returns 200, then prints a fake
success line. No CORS verification.
542 lines
22 KiB
Python
542 lines
22 KiB
Python
"""Test context overflow detection for LLM calls."""
|
||
|
||
import os
|
||
import uuid
|
||
from unittest.mock import Mock, patch
|
||
|
||
import pytest
|
||
from sqlalchemy import create_engine
|
||
from sqlalchemy.orm import sessionmaker
|
||
|
||
from local_deep_research.database.models import Base
|
||
from local_deep_research.metrics.token_counter import TokenCountingCallback
|
||
|
||
|
||
class TestContextOverflowDetection:
|
||
"""Test suite for context overflow detection."""
|
||
|
||
@pytest.fixture
|
||
def db_session(self):
|
||
"""Create an in-memory database session for testing."""
|
||
engine = create_engine("sqlite:///:memory:")
|
||
Base.metadata.create_all(engine)
|
||
Session = sessionmaker(bind=engine)
|
||
session = Session()
|
||
yield session
|
||
session.close()
|
||
engine.dispose()
|
||
|
||
@pytest.fixture
|
||
def token_callback(self):
|
||
"""Create a token counting callback for testing."""
|
||
research_id = str(uuid.uuid4())
|
||
research_context = {
|
||
"research_query": "Test query",
|
||
"research_mode": "test",
|
||
"context_limit": 2048, # Set a specific context limit
|
||
"username": "test_user",
|
||
"user_password": "test_pass",
|
||
}
|
||
return TokenCountingCallback(research_id, research_context)
|
||
|
||
def test_context_overflow_detection_no_overflow(self, token_callback):
|
||
"""Test that no overflow is detected for small prompts."""
|
||
# Simulate LLM start with small prompt
|
||
prompts = ["What is 2+2?"]
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
# Create mock response with Ollama-style metadata
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None # Explicitly set to None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][
|
||
0
|
||
].message.usage_metadata = None # No usage_metadata
|
||
mock_response.generations[0][0].message.response_metadata = {
|
||
"prompt_eval_count": 5, # Small token count
|
||
"eval_count": 10,
|
||
"total_duration": 1000000000, # 1 second in nanoseconds
|
||
}
|
||
|
||
# Process response
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
# Verify no overflow detected
|
||
assert token_callback.context_truncated is False
|
||
assert token_callback.tokens_truncated == 0
|
||
assert token_callback.truncation_ratio == 0.0
|
||
assert token_callback.ollama_metrics.get("prompt_eval_count") == 5
|
||
|
||
def test_context_overflow_detection_with_overflow(self, token_callback):
|
||
"""Test that overflow is detected when prompt approaches context limit."""
|
||
# Simulate LLM start with large prompt
|
||
large_text = "The quick brown fox jumps over the lazy dog. " * 500
|
||
prompts = [large_text]
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
# Create mock response indicating near-limit token usage
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = None
|
||
mock_response.generations[0][0].message.response_metadata = {
|
||
"prompt_eval_count": 1950, # Near the 2048 limit (95%)
|
||
"eval_count": 50,
|
||
"total_duration": 5000000000, # 5 seconds
|
||
"prompt_eval_duration": 4000000000,
|
||
"eval_duration": 1000000000,
|
||
}
|
||
|
||
# Process response
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
# Verify overflow detected
|
||
assert token_callback.context_truncated is True
|
||
assert token_callback.tokens_truncated > 0 # Should estimate truncation
|
||
assert token_callback.truncation_ratio > 0
|
||
assert token_callback.ollama_metrics["prompt_eval_count"] == 1950
|
||
|
||
def test_ollama_raw_metrics_capture(self, token_callback):
|
||
"""Test that raw Ollama metrics are properly captured."""
|
||
prompts = ["Test prompt"]
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
# Create mock response with full Ollama metrics
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = None
|
||
mock_response.generations[0][0].message.response_metadata = {
|
||
"prompt_eval_count": 100,
|
||
"eval_count": 200,
|
||
"total_duration": 3000000000,
|
||
"load_duration": 500000000,
|
||
"prompt_eval_duration": 2000000000,
|
||
"eval_duration": 500000000,
|
||
}
|
||
|
||
# Process response
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
# Verify all metrics captured
|
||
assert token_callback.ollama_metrics["prompt_eval_count"] == 100
|
||
assert token_callback.ollama_metrics["eval_count"] == 200
|
||
assert token_callback.ollama_metrics["total_duration"] == 3000000000
|
||
assert token_callback.ollama_metrics["load_duration"] == 500000000
|
||
assert (
|
||
token_callback.ollama_metrics["prompt_eval_duration"] == 2000000000
|
||
)
|
||
assert token_callback.ollama_metrics["eval_duration"] == 500000000
|
||
|
||
def test_context_limit_from_research_context(self):
|
||
"""Test that context limit is properly read from research context."""
|
||
# Create callback with context limit
|
||
callback = TokenCountingCallback("test-id", {"context_limit": 2048})
|
||
# Context limit is set on llm_start
|
||
callback.on_llm_start({}, ["test"])
|
||
assert callback.context_limit == 2048
|
||
|
||
# Test with different limit
|
||
callback_4k = TokenCountingCallback("test-id", {"context_limit": 4096})
|
||
callback_4k.on_llm_start({}, ["test"])
|
||
assert callback_4k.context_limit == 4096
|
||
|
||
# Test with no limit
|
||
callback_no_limit = TokenCountingCallback("test-id", {})
|
||
callback_no_limit.on_llm_start({}, ["test"])
|
||
assert callback_no_limit.context_limit is None
|
||
|
||
def test_prompt_size_estimation(self, token_callback):
|
||
"""Test that prompt size is estimated correctly."""
|
||
# Test with single prompt
|
||
prompts = ["This is a test prompt with approximately 10 words."]
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
# Rough estimate: ~4 chars per token
|
||
expected_tokens = len(prompts[0]) // 4
|
||
assert (
|
||
abs(token_callback.original_prompt_estimate - expected_tokens) < 5
|
||
)
|
||
|
||
# Test with multiple prompts
|
||
token_callback.original_prompt_estimate = 0
|
||
prompts = ["First prompt.", "Second prompt.", "Third prompt."]
|
||
total_chars = sum(len(p) for p in prompts)
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
expected_tokens = total_chars // 4
|
||
assert (
|
||
abs(token_callback.original_prompt_estimate - expected_tokens) < 5
|
||
)
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_overflow_warning_logged(self, mock_logger, token_callback):
|
||
"""Test that overflow detection logs a warning."""
|
||
# Create large prompt
|
||
large_text = "word " * 10000
|
||
prompts = [large_text]
|
||
token_callback.on_llm_start({}, prompts)
|
||
|
||
# Mock response at context limit
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = None
|
||
mock_response.generations[0][0].message.response_metadata = {
|
||
"prompt_eval_count": 2000, # At 95% of 2048 limit
|
||
"eval_count": 10,
|
||
}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
# Verify warning was logged with structured fields
|
||
mock_logger.warning.assert_called()
|
||
# The new logger.warning is called with multiple string args that get
|
||
# concatenated by f-strings; inspect each positional arg for the fields.
|
||
warning_call = " ".join(
|
||
str(a) for a in mock_logger.warning.call_args[0]
|
||
)
|
||
assert "Context overflow detected" in warning_call
|
||
assert "[provider-confirmed]" in warning_call
|
||
assert "prompt_tokens=2000" in warning_call
|
||
assert "context_limit=2048" in warning_call
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_overflow_for_non_ollama_provider(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""Detection fires from prompt estimate when provider doesn't echo prompt_eval_count."""
|
||
# Use preset_model/preset_provider so on_llm_start initializes by_model.
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
|
||
# Large prompt: ~10000 tokens (well over 2048 context_limit)
|
||
large_text = "word " * 10000
|
||
token_callback.on_llm_start({}, [large_text])
|
||
|
||
# OpenAI-style response: usage_metadata, no prompt_eval_count
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = {
|
||
"input_tokens": 100,
|
||
"output_tokens": 50,
|
||
"total_tokens": 150,
|
||
}
|
||
mock_response.generations[0][0].message.response_metadata = {}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
# Estimation path should mark truncation and log
|
||
assert token_callback.context_truncated is True
|
||
assert token_callback.tokens_truncated > 0
|
||
mock_logger.warning.assert_called()
|
||
warning_call = " ".join(
|
||
str(a) for a in mock_logger.warning.call_args[0]
|
||
)
|
||
assert "[estimated]" in warning_call
|
||
assert "provider=openai" in warning_call
|
||
assert "context_limit=2048" in warning_call
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_provider_confirmed_total_context_overflow(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""[total-context] fires when input+output exceeds limit but input alone doesn't.
|
||
|
||
Input must stay strictly under the input-only threshold (80% of
|
||
context_limit, set by PR #3792 / #3840) so the input-only branch
|
||
does NOT fire and we exercise the total-context elif path.
|
||
"""
|
||
large_text = "word " * 500
|
||
token_callback.on_llm_start({}, [large_text])
|
||
|
||
# Input is below 80% of 2048 (1638); but input + output >= 95% of 2048 (1945).
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = None
|
||
mock_response.generations[0][0].message.response_metadata = {
|
||
"prompt_eval_count": 1500, # 73% of 2048 — below 80% input-only threshold
|
||
"eval_count": 500, # 1500+500=2000 >= 95% of 2048 (1945)
|
||
"total_duration": 3000000000,
|
||
}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
assert token_callback.context_truncated is True
|
||
mock_logger.warning.assert_called()
|
||
warning_call = " ".join(
|
||
str(a) for a in mock_logger.warning.call_args[0]
|
||
)
|
||
assert "[total-context]" in warning_call
|
||
assert "prompt_tokens=1500" in warning_call
|
||
assert "completion_tokens=500" in warning_call
|
||
assert "total_tokens=2000" in warning_call
|
||
assert "context_limit=2048" in warning_call
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_total_context_overflow_for_non_ollama(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""[total-context] fires when hosted provider input+output exceeds limit.
|
||
|
||
Input alone stays below 80% of 2048 (input-only threshold, set by
|
||
#3792/#3840) so the elif total-context branch is exercised.
|
||
"""
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
|
||
# Prompt estimate below context_limit but not by much
|
||
medium_text = "word " * 1500 # ~1875 estimated tokens
|
||
token_callback.on_llm_start({}, [medium_text])
|
||
|
||
# Response reports actual tokens: input below 80% but input+output >= 95%.
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = {
|
||
"input_tokens": 1500, # 73% of 2048 — below 80% input-only threshold
|
||
"output_tokens": 600, # 1500+600=2100 > 2048 (so tokens_truncated > 0)
|
||
"total_tokens": 2100,
|
||
}
|
||
mock_response.generations[0][0].message.response_metadata = {}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
assert token_callback.context_truncated is True
|
||
assert token_callback.tokens_truncated > 0
|
||
mock_logger.warning.assert_called()
|
||
warning_call = " ".join(
|
||
str(a) for a in mock_logger.warning.call_args[0]
|
||
)
|
||
assert "[total-context]" in warning_call
|
||
assert "prompt_tokens=1500" in warning_call
|
||
assert "completion_tokens=600" in warning_call
|
||
assert "total_tokens=2100" in warning_call
|
||
assert "context_limit=2048" in warning_call
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_total_context_overflow_via_llm_output(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""[estimated-total-context] fires when token_usage comes from llm_output.
|
||
|
||
Distinct from the [total-context] path: this exercises the post-loop
|
||
estimation block (token_counter.py ~line 440) where token_usage is
|
||
sourced from response.llm_output rather than from the per-generation
|
||
usage_metadata that _check_context_overflow consumes.
|
||
|
||
Conditions for this path:
|
||
- token_usage populated from llm_output (not from generations)
|
||
- generations have NO usage_metadata AND NO response_metadata
|
||
(so _check_context_overflow is never called)
|
||
- original_prompt_estimate <= context_limit (so the [estimated]
|
||
input-only branch above doesn't fire)
|
||
- prompt_tokens + completion_tokens >= 95% of context_limit
|
||
"""
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
|
||
# Estimate well below context_limit — input-only [estimated] stays quiet.
|
||
small_text = "word " * 100 # ~125 estimated tokens
|
||
token_callback.on_llm_start({}, [small_text])
|
||
|
||
# token_usage flows in via llm_output; generations carry no metadata
|
||
# so _check_context_overflow is never called.
|
||
mock_response = Mock()
|
||
mock_response.llm_output = {
|
||
"token_usage": {
|
||
"prompt_tokens": 1500, # 73% of 2048 — below 80%
|
||
"completion_tokens": 600, # 1500+600=2100, past context_limit
|
||
"total_tokens": 2100,
|
||
}
|
||
}
|
||
# Empty generations list → loop never enters _check_context_overflow.
|
||
mock_response.generations = []
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
assert token_callback.context_truncated is True
|
||
assert token_callback.tokens_truncated > 0
|
||
mock_logger.warning.assert_called()
|
||
warning_call = " ".join(
|
||
str(a) for a in mock_logger.warning.call_args[0]
|
||
)
|
||
assert "[estimated-total-context]" in warning_call
|
||
assert "prompt_tokens=1500" in warning_call
|
||
assert "completion_tokens=600" in warning_call
|
||
assert "total_tokens=2100" in warning_call
|
||
assert "context_limit=2048" in warning_call
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_path_fires_on_subsequent_calls_after_first_truncation(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""Regression: TokenCountingCallback is reused across LLM calls in a
|
||
research session (see config/llm_config.py wrap_llm). on_llm_start
|
||
must reset context_truncated/tokens_truncated/truncation_ratio,
|
||
otherwise the post-loop estimation block's `if not context_truncated`
|
||
guard silently disables [estimated] / [estimated-total-context] for
|
||
every call after the first one that truncates.
|
||
"""
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
|
||
def llm_output_overflow_response():
|
||
mock_response = Mock()
|
||
mock_response.llm_output = {
|
||
"token_usage": {
|
||
"prompt_tokens": 1500,
|
||
"completion_tokens": 600,
|
||
"total_tokens": 2100,
|
||
}
|
||
}
|
||
mock_response.generations = []
|
||
return mock_response
|
||
|
||
# Call 1: triggers [estimated-total-context] → context_truncated=True.
|
||
token_callback.on_llm_start({}, ["word " * 100])
|
||
token_callback.on_llm_end(llm_output_overflow_response())
|
||
first_call_count = mock_logger.warning.call_count
|
||
assert first_call_count >= 1
|
||
assert token_callback.context_truncated is True
|
||
|
||
# Call 2: same overflow shape — must ALSO log, not be silenced by
|
||
# leftover state from Call 1.
|
||
token_callback.on_llm_start({}, ["word " * 100])
|
||
# Reset asserted by the second on_llm_start clearing prior state:
|
||
assert token_callback.context_truncated is False
|
||
token_callback.on_llm_end(llm_output_overflow_response())
|
||
assert mock_logger.warning.call_count > first_call_count
|
||
assert token_callback.context_truncated is True
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_overflow_skipped_when_context_limit_none(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""Estimation path should not fire when context_limit is not set."""
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
# Remove context_limit
|
||
token_callback.research_context = {}
|
||
|
||
large_text = "word " * 10000
|
||
token_callback.on_llm_start({}, [large_text])
|
||
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = {
|
||
"input_tokens": 100,
|
||
"output_tokens": 50,
|
||
"total_tokens": 150,
|
||
}
|
||
mock_response.generations[0][0].message.response_metadata = {}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
assert token_callback.context_truncated is False
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_overflow_zero_prompt_estimate(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""Estimation path should not fire when prompt estimate is 0."""
|
||
token_callback.preset_model = "gpt-4"
|
||
token_callback.preset_provider = "openai"
|
||
token_callback.original_prompt_estimate = 0
|
||
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = {
|
||
"input_tokens": 100,
|
||
"output_tokens": 50,
|
||
"total_tokens": 150,
|
||
}
|
||
mock_response.generations[0][0].message.response_metadata = {}
|
||
|
||
token_callback.on_llm_end(mock_response)
|
||
|
||
assert token_callback.context_truncated is False
|
||
assert token_callback.tokens_truncated == 0
|
||
|
||
@patch("local_deep_research.metrics.token_counter.logger")
|
||
def test_estimated_overflow_without_on_llm_start(
|
||
self, mock_logger, token_callback
|
||
):
|
||
"""Estimation path should not crash if on_llm_start was never called."""
|
||
# Don't call on_llm_start — fields stay at defaults
|
||
mock_response = Mock()
|
||
mock_response.llm_output = None
|
||
mock_response.generations = [[Mock()]]
|
||
mock_response.generations[0][0].message = Mock()
|
||
mock_response.generations[0][0].message.usage_metadata = {
|
||
"input_tokens": 100,
|
||
"output_tokens": 50,
|
||
"total_tokens": 150,
|
||
}
|
||
mock_response.generations[0][0].message.response_metadata = {}
|
||
|
||
# Should not raise and should not mark truncation
|
||
token_callback.on_llm_end(mock_response)
|
||
assert token_callback.context_truncated is False
|
||
|
||
|
||
@pytest.mark.skipif(
|
||
os.environ.get("SKIP_OLLAMA_TESTS", "true").lower() == "true",
|
||
reason="Ollama integration tests skipped",
|
||
)
|
||
class TestContextOverflowIntegration:
|
||
"""Integration tests with actual Ollama (when available)."""
|
||
|
||
@pytest.mark.slow
|
||
def test_ollama_context_overflow_real(self):
|
||
"""Test with real Ollama instance if available."""
|
||
from langchain_ollama import ChatOllama
|
||
from local_deep_research.config.llm_config import (
|
||
is_ollama_available,
|
||
)
|
||
|
||
if not is_ollama_available():
|
||
pytest.skip("Ollama not available")
|
||
|
||
# Create LLM with small context window
|
||
llm = ChatOllama(
|
||
model="llama3.2:latest",
|
||
num_ctx=512, # Very small context for testing
|
||
temperature=0.1,
|
||
)
|
||
|
||
# Create callback
|
||
research_id = str(uuid.uuid4())
|
||
callback = TokenCountingCallback(research_id, {"context_limit": 512})
|
||
|
||
# Create prompt that will likely overflow
|
||
large_prompt = "Please analyze this text: " + ("word " * 200)
|
||
|
||
# Run with callback
|
||
try:
|
||
_ = llm.invoke(large_prompt, config={"callbacks": [callback]})
|
||
|
||
# Check if overflow was detected
|
||
if callback.ollama_metrics.get("prompt_eval_count"):
|
||
prompt_tokens = callback.ollama_metrics["prompt_eval_count"]
|
||
if prompt_tokens >= 512 * 0.80:
|
||
assert callback.context_truncated is True
|
||
print(f"✅ Overflow detected: {prompt_tokens}/512 tokens")
|
||
else:
|
||
print(f"ℹ️ No overflow: {prompt_tokens}/512 tokens")
|
||
except Exception as e:
|
||
pytest.skip(f"Ollama test failed: {e}")
|