Files
crawl4ai/tests/test_async_url_seeder.py
AHMET YILMAZ 43a2088eb0 Fix redirect target verification in AsyncUrlSeeder and enhance tests
- Added `verify_redirect_targets` parameter to control redirect verification.
- Modified `_resolve_head()` to verify redirect targets based on the new parameter.
- Implemented tests for both verification modes, ensuring dead redirects are filtered out and legacy behavior is preserved.
2025-11-18 11:43:47 +08:00

29 lines
1.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import pytest
import asyncio
from crawl4ai.async_url_seeder import AsyncUrlSeeder
@pytest.mark.asyncio
async def test_resolve_head_handles_dead_redirects():
seeder = AsyncUrlSeeder()
# Should return None redirects to a dead URL
assert await seeder._resolve_head("http://youtube.com/sitemap.xml") is None
assert await seeder._resolve_head("https://stripe.com/sitemap.xml") is None
@pytest.mark.asyncio
async def test_resolve_head_direct_hit():
seeder = AsyncUrlSeeder()
# Test with a known live URL, e.g., httpbin
result = await seeder._resolve_head("https://httpbin.org/status/200")
assert result == "https://httpbin.org/status/200"
@pytest.mark.asyncio
async def test_resolve_head_verify_redirect_targets_false():
# Test with verification disabled - should return redirect target without checking if alive
seeder = AsyncUrlSeeder(verify_redirect_targets=False)
# This should return the redirect target even if it's dead (old behavior)
result = await seeder._resolve_head("http://youtube.com/sitemap.xml")
# The exact redirect target might vary, but it should not be None
assert result is not None
assert isinstance(result, str)
# Should be different from the input URL (indicating redirect was followed)
assert result != "http://youtube.com/sitemap.xml"