# File: crawl4ai/tests/general/test_url_seeder_for_only_sitemap.py
# (Python, 31 lines, 998 B)

import asyncio
from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
from pathlib import Path
import httpx
async def test_sitemap_source_does_not_hit_commoncrawl():
    """Verify that seeding with ``source="sitemap"`` never calls ``_latest_index``.

    The seeder's ``_latest_index`` attribute is replaced with a stub that
    records every invocation and then raises ``httpx.ConnectTimeout`` (the
    stub's message simulates a CommonCrawl outage). After ``seeder.urls``
    returns or raises, the recorded calls decide the outcome.

    NOTE(review): this still performs a real request against
    https://docs.crawl4ai.com/ and needs an async-aware test runner (or the
    ``__main__`` entry point) to execute the coroutine.

    Raises:
        AssertionError: if ``_latest_index`` was invoked during seeding.
        httpx.ConnectTimeout: if a timeout occurred that did not originate
            from the stub, so the run is inconclusive rather than a failure.
    """
    config = SeedingConfig(
        source="sitemap",
        live_check=False,
        extract_head=False,
        max_urls=50,
        verbose=True,
        force=False,
    )

    # Explicit call record: relying only on the stub's exception reaching us
    # would miss the case where the seeder catches it internally.
    index_lookups = []

    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:

        async def boom(*args, **kwargs):
            index_lookups.append((args, kwargs))
            print("DEBUG: _latest_index called")
            raise httpx.ConnectTimeout("Simulated CommonCrawl outage")

        seeder._latest_index = boom
        try:
            await seeder.urls("https://docs.crawl4ai.com/", config)
        except httpx.ConnectTimeout:
            if not index_lookups:
                # A timeout that is not ours says nothing about the behaviour
                # under test; surface it instead of reporting a false FAIL.
                raise

    if index_lookups:
        print("FAIL: _latest_index WAS called even though source='sitemap'.")
        raise AssertionError(
            "_latest_index was called even though source='sitemap'"
        )
    print("PASS: _latest_index was NOT called (expected after fix).")
if __name__ == "__main__":
    # Script mode: build the test coroutine, then drive it to completion on a
    # fresh event loop so the check can run without a test runner.
    _pending_check = test_sitemap_source_does_not_hit_commoncrawl()
    asyncio.run(_pending_check)