- Added `verify_redirect_targets` parameter to control redirect verification.
- Modified `_resolve_head()` to verify redirect targets based on the new parameter.
- Implemented tests for both verification modes, ensuring dead redirects are filtered out and legacy behavior is preserved.
29 lines
1.3 KiB
Python
29 lines
1.3 KiB
Python
import pytest
|
||
import asyncio
|
||
from crawl4ai.async_url_seeder import AsyncUrlSeeder
|
||
|
||
@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
    """Check that `_resolve_head` yields None for URLs expected to redirect to a dead target.

    NOTE(review): the URLs point at real hosts, so this presumably needs
    live network access and depends on their current redirect behavior.
    """
    url_seeder = AsyncUrlSeeder()
    # Each entry is expected to redirect to a dead URL, so resolution gives None.
    dead_redirect_urls = (
        "http://youtube.com/sitemap.xml",
        "https://stripe.com/sitemap.xml",
    )
    for sitemap_url in dead_redirect_urls:
        resolved = await url_seeder._resolve_head(sitemap_url)
        assert resolved is None
|
||
|
||
@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
    """Check that `_resolve_head` returns a known live URL unchanged."""
    # httpbin is used as a known live endpoint.
    live_url = "https://httpbin.org/status/200"
    url_seeder = AsyncUrlSeeder()
    resolved = await url_seeder._resolve_head(live_url)
    assert resolved == live_url
|
||
|
||
@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
    """Check `_resolve_head` when the seeder is built with `verify_redirect_targets=False`.

    With verification disabled, the redirect target is expected to be
    returned without checking whether it is alive (the old behavior).
    """
    source_url = "http://youtube.com/sitemap.xml"
    url_seeder = AsyncUrlSeeder(verify_redirect_targets=False)
    # Even if the target is dead, it should still come back in this mode.
    redirect_target = await url_seeder._resolve_head(source_url)
    # The exact target may vary, so only require a non-None string...
    assert redirect_target is not None
    assert isinstance(redirect_target, str)
    # ...that differs from the input, showing the redirect was followed.
    assert redirect_target != source_url