Merge PR #1746: Fix sitemap-only URL seeding to avoid unnecessary Common Crawl calls

This commit is contained in:
unclecode
2026-02-01 02:57:06 +00:00
2 changed files with 37 additions and 5 deletions

View File

@@ -400,18 +400,20 @@ class AsyncUrlSeeder:
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
self.logger.verbose = config.verbose
# ensure we have the latest CC collection id
if self.index_id is None:
self.index_id = await self._latest_index()
# Parse source parameter - split by '+' to get list of sources
sources = source.split('+')
sources = [s.strip().lower() for s in source.split("+") if s.strip()]
valid_sources = {"cc", "sitemap"}
for s in sources:
if s not in valid_sources:
raise ValueError(
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
# ensure we have the latest CC collection id when the source is cc
if s == "cc" and self.index_id is None:
self.index_id = await self._latest_index()
if hits_per_sec:
if hits_per_sec <= 0:
self._log(

View File

@@ -0,0 +1,30 @@
import asyncio
from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
from pathlib import Path
import httpx
async def test_sitemap_source_does_not_hit_commoncrawl():
    """Regression check: seeding with source='sitemap' must never consult the
    Common Crawl index.

    Strategy: monkey-patch the seeder's ``_latest_index`` with a coroutine that
    raises ``httpx.ConnectTimeout``. If the seeder still reaches out to Common
    Crawl, the timeout surfaces and we report FAIL; otherwise the sitemap-only
    crawl completes and we report PASS.
    """
    seeding_cfg = SeedingConfig(
        source="sitemap",
        live_check=False,
        extract_head=False,
        max_urls=50,
        verbose=True,
        force=False,
    )

    async def _raise_timeout(*_args, **_kwargs):
        # Any invocation means the Common Crawl index was (wrongly) consulted.
        print("DEBUG: _latest_index called")
        raise httpx.ConnectTimeout("Simulated CommonCrawl outage")

    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as url_seeder:
        url_seeder._latest_index = _raise_timeout
        try:
            await url_seeder.urls("https://docs.crawl4ai.com/", seeding_cfg)
        except httpx.ConnectTimeout:
            print("FAIL: _latest_index WAS called even though source='sitemap'.")
        else:
            print("PASS: _latest_index was NOT called (expected after fix).")
if __name__ == "__main__":
    # Run the regression check as a standalone script; asyncio.run drives the
    # async test coroutine to completion on a fresh event loop.
    asyncio.run(test_sitemap_source_does_not_hit_commoncrawl())