Merge PR #1746: Fix sitemap-only URL seeding to avoid Common Crawl calls
This commit is contained in:
@@ -400,18 +400,20 @@ class AsyncUrlSeeder:
|
|||||||
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
if self.logger and hasattr(self.logger, 'verbose') and config.verbose is not None:
|
||||||
self.logger.verbose = config.verbose
|
self.logger.verbose = config.verbose
|
||||||
|
|
||||||
# ensure we have the latest CC collection id
|
|
||||||
if self.index_id is None:
|
|
||||||
self.index_id = await self._latest_index()
|
|
||||||
|
|
||||||
# Parse source parameter - split by '+' to get list of sources
|
# Parse source parameter - split by '+' to get list of sources
|
||||||
sources = source.split('+')
|
sources = [s.strip().lower() for s in source.split("+") if s.strip()]
|
||||||
|
|
||||||
valid_sources = {"cc", "sitemap"}
|
valid_sources = {"cc", "sitemap"}
|
||||||
for s in sources:
|
for s in sources:
|
||||||
if s not in valid_sources:
|
if s not in valid_sources:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
|
f"Invalid source '{s}'. Valid sources are: {', '.join(valid_sources)}")
|
||||||
|
|
||||||
|
# ensure we have the latest CC collection id when the source is cc
|
||||||
|
if s == "cc" and self.index_id is None:
|
||||||
|
self.index_id = await self._latest_index()
|
||||||
|
|
||||||
|
|
||||||
if hits_per_sec:
|
if hits_per_sec:
|
||||||
if hits_per_sec <= 0:
|
if hits_per_sec <= 0:
|
||||||
self._log(
|
self._log(
|
||||||
|
|||||||
30
tests/general/test_url_seeder_for_only_sitemap.py
Normal file
30
tests/general/test_url_seeder_for_only_sitemap.py
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
import asyncio
|
||||||
|
from crawl4ai import AsyncLogger, AsyncUrlSeeder, SeedingConfig
|
||||||
|
from pathlib import Path
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
async def test_sitemap_source_does_not_hit_commoncrawl():
    """Regression test: source="sitemap" must never query Common Crawl.

    Before the fix, AsyncUrlSeeder.urls() fetched the latest Common Crawl
    collection id unconditionally; with source="sitemap" that lookup must be
    skipped entirely. We stub out ``_latest_index`` so that any call to it
    raises, turning an accidental Common Crawl hit into a hard test failure.

    NOTE(review): this test still performs a live sitemap fetch against
    https://docs.crawl4ai.com/ — consider mocking the sitemap request too so
    the test runs offline.
    """
    config = SeedingConfig(
        source="sitemap",
        live_check=False,
        extract_head=False,
        max_urls=50,
        verbose=True,
        force=False
    )

    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=True)) as seeder:
        # Sentinel stub: fails loudly if the Common Crawl index is consulted.
        async def boom(*args, **kwargs):
            raise httpx.ConnectTimeout("Simulated CommonCrawl outage")

        seeder._latest_index = boom

        try:
            await seeder.urls("https://docs.crawl4ai.com/", config)
        except httpx.ConnectTimeout:
            # Previously this branch only print()-ed "FAIL", so pytest would
            # still report the test as passing. Raise a real assertion error
            # so CI actually catches the regression.
            raise AssertionError(
                "_latest_index WAS called even though source='sitemap'."
            ) from None
||||||
|
if __name__ == "__main__":
    # Manual entry point: allow running this async test directly as a script.
    main_coro = test_sitemap_source_does_not_hit_commoncrawl()
    asyncio.run(main_coro)
|
||||||
Reference in New Issue
Block a user