Files
local-deep-research/tests/test_wikipedia_url_security.py
LearningCircuit d9bc003094 fix: resolve test failures and fix import paths in test suite
- Bulk fix import paths from src.local_deep_research to local_deep_research
  across 190+ test files
- Fix test_settings_routes.py: update validate_setting tests to use proper
  Setting objects instead of incorrect parameter signature
- Fix test_research_service.py: remove tests for non-existent functions,
  correct mimetype assertions for latex export
- Fix test_app_factory.py: use truly public IP (8.8.8.8) instead of
  TEST-NET reserved range for IP validation test
- Fix test_api_functions.py: correct patch paths for get_user_db_session,
  has_request_context, and get_settings_manager
- Fix test_news_api.py: correct patch paths for database session

Test results: 8592 passing, 175 failing (down from 286), 115 skipped
Remaining failures are test isolation issues that pass individually.
2026-01-21 20:50:48 +01:00

166 lines
5.7 KiB
Python

"""
Test for Wikipedia URL validation security fix.
This addresses the security issue mentioned in PR #361 review.
"""
from tests.fixtures.search_engine_mocks import validate_wikipedia_url
class TestWikipediaURLSecurity:
"""Test Wikipedia URL validation for security."""
def test_valid_wikipedia_urls(self):
"""Test that valid Wikipedia URLs are accepted."""
valid_urls = [
"https://en.wikipedia.org/wiki/Python",
"https://wikipedia.org/wiki/Main_Page",
"https://fr.wikipedia.org/wiki/Accueil",
"https://de.wikipedia.org/wiki/Deutschland",
"https://es.wikipedia.org/wiki/España",
"https://ja.wikipedia.org/wiki/日本",
"https://zh.wikipedia.org/wiki/中国",
"https://simple.wikipedia.org/wiki/Computer",
"https://meta.wikipedia.org/wiki/Main_Page",
]
for url in valid_urls:
assert validate_wikipedia_url(url) is True, (
f"Valid URL rejected: {url}"
)
def test_invalid_wikipedia_urls(self):
"""Test that invalid/malicious URLs are rejected."""
invalid_urls = [
# Non-Wikipedia domains
"https://fakepedia.org/wiki/Test",
"https://wikipedia.com/wiki/Test", # .com not .org
"https://wikipediaorg.com/wiki/Test",
"https://en-wikipedia.org/wiki/Test", # Hyphenated domain
# Malicious attempts
"https://wikipedia.org.evil.com/wiki/Test",
"https://evil.com/wikipedia.org/wiki/Test",
"https://wikipedia.org@evil.com/wiki/Test",
"https://wikipedia.org.evil.com",
# Invalid URLs
"not-a-url",
"ftp://wikipedia.org/wiki/Test",
"javascript:alert('xss')",
"file:///etc/passwd",
"",
None,
# Edge cases
"https://wikipediaXorg/wiki/Test",
"https://wikipedia_org/wiki/Test",
"https://wikipedia..org/wiki/Test",
]
for url in invalid_urls:
assert validate_wikipedia_url(url) is False, (
f"Invalid URL accepted: {url}"
)
def test_url_validation_handles_exceptions(self):
"""Test that URL validation handles exceptions gracefully."""
# Test with various problematic inputs
problematic_inputs = [
123, # Integer
[], # List
{}, # Dict
object(), # Generic object
float("inf"), # Infinity
float("nan"), # NaN
]
for input_val in problematic_inputs:
# Should return False without raising exception
assert validate_wikipedia_url(input_val) is False
def test_wikipedia_search_uses_validation(self, monkeypatch):
"""Test that Wikipedia search engine uses URL validation."""
from tests.test_utils import add_src_to_path
add_src_to_path()
# Import after adding src to path
from local_deep_research.web_search_engines.engines.search_engine_wikipedia import (
WikipediaSearchEngine,
)
# The WikipediaSearchEngine should construct safe URLs internally
# It should not use user-provided URLs directly
search = WikipediaSearchEngine(max_results=1)
# Mock the wikipedia module's search method
import wikipedia
def mock_search(*args, **kwargs):
return ["Test Article"]
monkeypatch.setattr(wikipedia, "search", mock_search)
# Mock wikipedia.summary to avoid actual API calls
def mock_summary(title, *args, **kwargs):
return "Test summary for the article."
monkeypatch.setattr(wikipedia, "summary", mock_summary)
# Get previews (this is where URLs are constructed)
previews = search._get_previews("test query")
# Verify that the URL is properly constructed
if previews:
for preview in previews:
url = preview.get("link", "")
# URL should be a valid Wikipedia URL
assert validate_wikipedia_url(url), (
f"Invalid URL generated: {url}"
)
# URL should follow the expected pattern
assert url.startswith("https://en.wikipedia.org/wiki/"), (
f"Unexpected URL format: {url}"
)
def test_wikipedia_url_in_response_validation():
"""Test that we validate URLs in Wikipedia API responses."""
# This is a conceptual test showing how URL validation should be used
# when processing Wikipedia API responses
def process_wikipedia_response(response_data):
"""Process Wikipedia response with URL validation."""
results = []
for item in response_data.get("results", []):
# If the response contains URLs, validate them
if "url" in item:
if not validate_wikipedia_url(item["url"]):
# Skip items with invalid URLs
continue
results.append(item)
return results
# Test with mixed valid/invalid URLs
test_response = {
"results": [
{
"title": "Valid Article",
"url": "https://en.wikipedia.org/wiki/Valid",
},
{
"title": "Invalid Article",
"url": "https://fake-wikipedia.org/wiki/Invalid",
},
{"title": "No URL Article"}, # No URL field
]
}
processed = process_wikipedia_response(test_response)
# Should only include valid items
assert len(processed) == 2
assert processed[0]["title"] == "Valid Article"
assert processed[1]["title"] == "No URL Article"