Fix redirect target verification in AsyncUrlSeeder and enhance tests

- Added `verify_redirect_targets` parameter to control redirect verification.
- Modified `_resolve_head()` to verify redirect targets based on the new parameter.
- Implemented tests for both verification modes, ensuring dead redirects are filtered out and legacy behavior is preserved.
This commit is contained in:
AHMET YILMAZ
2025-11-18 11:43:47 +08:00
parent c2c4d42be4
commit 43a2088eb0
2 changed files with 81 additions and 8 deletions

View File

@@ -0,0 +1,29 @@
import pytest
import asyncio
from crawl4ai.async_url_seeder import AsyncUrlSeeder
@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
seeder = AsyncUrlSeeder()
# Should return None redirects to a dead URL
assert await seeder._resolve_head("http://youtube.com/sitemap.xml") is None
assert await seeder._resolve_head("https://stripe.com/sitemap.xml") is None
@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
seeder = AsyncUrlSeeder()
# Test with a known live URL, e.g., httpbin
result = await seeder._resolve_head("https://httpbin.org/status/200")
assert result == "https://httpbin.org/status/200"
@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
# Test with verification disabled - should return redirect target without checking if alive
seeder = AsyncUrlSeeder(verify_redirect_targets=False)
# This should return the redirect target even if it's dead (old behavior)
result = await seeder._resolve_head("http://youtube.com/sitemap.xml")
# The exact redirect target might vary, but it should not be None
assert result is not None
assert isinstance(result, str)
# Should be different from the input URL (indicating redirect was followed)
assert result != "http://youtube.com/sitemap.xml"